From 6346c97e10b2f568d1d5ab27fd1cb826a6df8711 Mon Sep 17 00:00:00 2001 From: imedina Date: Sun, 21 Jan 2024 02:10:16 +0000 Subject: [PATCH 001/148] download: gwas catalog fixes --- .../src/main/resources/configuration.yml | 45 ++++++++++--------- .../org/opencb/cellbase/lib/EtlCommons.java | 2 +- 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index f24827532c..21a559eb76 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -45,6 +45,7 @@ server: port: "${CELLBASE.SERVER.REST.PORT}" defaultOutdir: "/tmp" download: + ## Genomic and Gene information ensembl: database: host: ensembldb.ensembl.org:3306 @@ -64,9 +65,6 @@ download: hgnc: host: https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2023-11-01.txt version: 2023-11-01 - cancerHotspot: - host: https://www.cancerhotspots.org/files/hotspots_v2.xls - version: "v2" refSeq: host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz refSeqFasta: @@ -76,8 +74,6 @@ download: refSeqCdna: host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz maneSelect: -# host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_0.93/MANE.GRCh38.v0.93.summary.txt.gz -# host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.0/MANE.GRCh38.v1.0.summary.txt.gz host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.1/MANE.GRCh38.v1.1.summary.txt.gz version: "1.1" lrg: @@ -88,6 +84,8 @@ download: version: "2023-11-08" geneExpressionAtlas: host: ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz + goAnnotation: + host: http://geneontology.org/gene-associations/goa_human.gaf.gz mirbase: host: 
ftp://mirbase.org/pub/mirbase/CURRENT/miRNA.xls.gz mirbaseReadme: @@ -121,15 +119,13 @@ download: gerp: host: http://ftp.ensembl.org/pub/release-110/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw version: "2023-05-17" + + ## Clinical Variant clinvar: -# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2021-07.xml.gz -# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-02.xml.gz # host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-11.xml.gz host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2023-12.xml.gz version: "2023-12-01" clinvarVariation: -# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2021-07.xml.gz -# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-02.xml.gz # host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-11.xml.gz host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2023-12.xml.gz clinvarSummary: @@ -140,6 +136,9 @@ download: version: "2023-12-01" clinvarEfoTerms: host: ftp://ftp.ebi.ac.uk/pub/databases/eva/ClinVar/2015/ClinVar_Traits_EFO_Names_260615.csv + cancerHotspot: + host: https://www.cancerhotspots.org/files/hotspots_v2.xls + version: "v2" iarctp53: host: http://p53.iarc.fr/ajax/Zipper.ashx docm: @@ -154,10 +153,19 @@ download: host: http://hgdownload.cse.ucsc.edu/goldenPath genomicSuperDups: host: http://hgdownload.cse.ucsc.edu/goldenPath + + ## Variant Pathogenic Prediction + revel: + host: https://zenodo.org/record/7072866/files/revel-v1.3_all_chromosomes.zip + version: "1.3" + cadd: +# host: https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz + ## Nacho: Move to https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz ASAP! 
+ host: https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz + version: "1.7-pre" gwasCatalog: -# host: http://resources.opencb.org/opencb/cellbase/data/gwas/gwas_catalog_v1.0.2-associations_e106_r2022-05-17.tsv -# version: "1.0.2 associations_e106_r2022-05-17" - host: ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/2023/12/21/gwas-catalog-associations.tsv + ## Download file from https://www.ebi.ac.uk/gwas/docs/file-downloads to find the real version, which is 'e110_r2023-12-20' + host: https://ftp.ebi.ac.uk/pub/databases/gwas/releases/2023/12/21/gwas-catalog-associations_ontology-annotated.tsv version: "23-12-21" hpo: ## Downlaod manually from here now: https://hpo.jax.org/app/data/annotations @@ -170,16 +178,13 @@ download: dgidb: host: https://old.dgidb.org/data/monthly_tsvs/2022-Feb/interactions.tsv version: "2022-02-01" - cadd: - ## Nacho: Move to https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz ASAP! -# host: https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz - host: https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz - version: "1.7-pre" reactome: host: http://www.reactome.org/download/current/biopax.zip gnomadConstraints: host: https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz version: "2.1.1" + + ## OBO Ontologies hpoObo: host: http://purl.obolibrary.org/obo/hp.obo version: "2023-12-01" @@ -192,10 +197,8 @@ download: mondoObo: host: http://purl.obolibrary.org/obo/mondo.obo version: "2023-12-01" - goAnnotation: - host: http://geneontology.org/gene-associations/goa_human.gaf.gz - revel: - host: https://zenodo.org/record/7072866/files/revel-v1.3_all_chromosomes.zip + + ## Others pubmed: host: https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/ files: diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 124ac6e6fc..f8ee4938e5 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -61,7 +61,7 @@ public class EtlCommons { public static final String CLINVAR_SUMMARY_FILE = "variant_summary.txt.gz"; public static final String CLINVAR_VARIATION_ALLELE_FILE = "variation_allele.txt.gz"; public static final String IARCTP53_FILE = "IARC-TP53.zip"; - public static final String GWAS_FILE = "gwas_catalog.tsv"; + public static final String GWAS_FILE = "gwas-catalog-associations_ontology-annotated.tsv"; public static final String COSMIC_FILE = "CosmicMutantExport.tsv.gz"; public static final String DBSNP_FILE = "All.vcf.gz"; From 89264c22d04e94dd48665ddeeefb22ce9b0cd13a Mon Sep 17 00:00:00 2001 From: imedina Date: Fri, 1 Mar 2024 09:33:22 +0000 Subject: [PATCH 002/148] Update configuration.yml --- .../src/main/resources/configuration.yml | 45 ++++++++++--------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 21a559eb76..2204acf270 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -64,15 +64,19 @@ download: host: ftp://ftp.ensemblgenomes.org/pub hgnc: host: https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2023-11-01.txt - version: 2023-11-01 + version: "2023-11-01" refSeq: host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz + version: "2023-10-11" refSeqFasta: host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz + version: "2023-10-11" refSeqProteinFasta: host: 
https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_protein.faa.gz + version: "2023-10-11" refSeqCdna: host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz + version: "2023-10-11" maneSelect: host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.1/MANE.GRCh38.v1.1.summary.txt.gz version: "1.1" @@ -84,6 +88,7 @@ download: version: "2023-11-08" geneExpressionAtlas: host: ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz + version: "2.0.14" goAnnotation: host: http://geneontology.org/gene-associations/goa_human.gaf.gz mirbase: @@ -99,18 +104,19 @@ download: ## Protein Data uniprot: host: https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz - version: "2023-11-08" + version: "2024-01-24" uniprotRelNotes: host: https://ftp.uniprot.org/pub/databases/uniprot/relnotes.txt - version: "2023-11-08" + version: "2024-01-24" interpro: host: https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/protein2ipr.dat.gz - version: "2023-11-08" + version: "2024-01-24" interproRelNotes: host: https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/release_notes.txt + version: "2024-01-24" intact: host: https://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.txt - version: "2023-10-07" + version: "2024-02-16" ## Conservation Scores conservation: @@ -123,17 +129,17 @@ download: ## Clinical Variant clinvar: # host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-11.xml.gz - host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2023-12.xml.gz + host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/RCV_xml_old_format/ClinVarFullRelease_2024-02.xml.gz version: "2023-12-01" clinvarVariation: # host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-11.xml.gz - host: 
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2023-12.xml.gz + host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/VCV_xml_old_format/ClinVarVariationRelease_2024-02.xml.gz clinvarSummary: host: http://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz - version: "2023-12-01" + version: "2024-03-01" clinvarVariationAllele: host: http://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variation_allele.txt.gz - version: "2023-12-01" + version: "2024-03-01" clinvarEfoTerms: host: ftp://ftp.ebi.ac.uk/pub/databases/eva/ClinVar/2015/ClinVar_Traits_EFO_Names_260615.csv cancerHotspot: @@ -159,17 +165,16 @@ download: host: https://zenodo.org/record/7072866/files/revel-v1.3_all_chromosomes.zip version: "1.3" cadd: -# host: https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz - ## Nacho: Move to https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz ASAP! - host: https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz - version: "1.7-pre" + host: https://krishna.gs.washington.edu/download/CADD/v1.7/GRCh38/whole_genome_SNVs.tsv.gz + version: "1.7" gwasCatalog: ## Download file from https://www.ebi.ac.uk/gwas/docs/file-downloads to find the real version, which is 'e110_r2023-12-20' - host: https://ftp.ebi.ac.uk/pub/databases/gwas/releases/2023/12/21/gwas-catalog-associations_ontology-annotated.tsv - version: "23-12-21" + host: https://ftp.ebi.ac.uk/pub/databases/gwas/releases/2024/02/12/gwas-catalog-associations_ontology-annotated.tsv + version: "2024-02-12" hpo: - ## Downlaod manually from here now: https://hpo.jax.org/app/data/annotations + ## NOTE: Download manually from here now: https://hpo.jax.org/app/data/annotations host: https://ci.monarchinitiative.org/view/hpo/job/hpo.annotations/lastSuccessfulBuild/artifact/rare-diseases/util/annotation/phenotype_to_genes.txt + version: "2024-03-01" disgenet: 
host: https://www.disgenet.org/static/disgenet_ap1/files/downloads files: @@ -187,16 +192,16 @@ download: ## OBO Ontologies hpoObo: host: http://purl.obolibrary.org/obo/hp.obo - version: "2023-12-01" + version: "2024-03-01" goObo: host: http://purl.obolibrary.org/obo/go/go-basic.obo - version: "2023-12-01" + version: "2024-03-01" doidObo: host: http://purl.obolibrary.org/obo/doid.obo - version: "2023-12-01" + version: "2024-03-01" mondoObo: host: http://purl.obolibrary.org/obo/mondo.obo - version: "2023-12-01" + version: "2024-03-01" ## Others pubmed: From 2be5f214f8843b6c6c0a2c493f79b3fe94b053d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 7 Mar 2024 10:33:58 +0100 Subject: [PATCH 003/148] core: update pubmed URLs in the configuration file, #TASK-5775, #TASK-5564 --- cellbase-core/src/main/resources/configuration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 2204acf270..3c2b6ee443 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -207,7 +207,7 @@ download: pubmed: host: https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/ files: - - pubmed22n[1..1114..4].xml.gz + - pubmed24n[1..1219..4].xml.gz pharmGKB: host: https://www.pharmgkb.org/downloads version: v1 From fe05795eeef4dddb402832ee863277904517160a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 7 Mar 2024 11:10:13 +0100 Subject: [PATCH 004/148] core: update pubmed version in the configuration file, #TASK-5775, #TASK-5564 --- cellbase-core/src/main/resources/configuration.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 3c2b6ee443..b1d74cfd85 100644 --- 
a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -206,6 +206,7 @@ download: ## Others pubmed: host: https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/ + version: 2024 files: - pubmed24n[1..1219..4].xml.gz pharmGKB: From 50f7008d523d53188ab929fba113488ceb5ba56d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 7 Mar 2024 11:24:57 +0100 Subject: [PATCH 005/148] core: improve Ontology downloader, #TASK-5775, #TASK-5564 --- .../java/org/opencb/cellbase/lib/EtlCommons.java | 1 + .../lib/download/OntologyDownloadManager.java | 12 ++++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index f8ee4938e5..10c45ae64e 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -127,6 +127,7 @@ public class EtlCommons { public static final String HPO_VERSION_FILE = "hpoVersion.json"; public static final String GO_VERSION_FILE = "goVersion.json"; public static final String DO_VERSION_FILE = "doVersion.json"; + public static final String MONDO_VERSION_FILE = "mondoVersion.json"; public static final String HGMD_FILE = "hgmd.vcf"; public static final String PUBMED_VERSION_FILENAME = "pubmedVersion.json"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java index 522be7b27d..e7e510fb91 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java @@ -43,24 +43,28 @@ public List download() throws IOException, InterruptedException { 
Files.createDirectories(oboFolder); String url = configuration.getDownload().getHpoObo().getHost(); + logger.info("Downloading {} ...", url); downloadFiles.add(downloadFile(url, oboFolder.resolve("hp.obo").toString())); saveVersionData(EtlCommons.OBO_DATA, "HPO", getTimeStamp(), getTimeStamp(), - Collections.singletonList(url), buildFolder.resolve(EtlCommons.HPO_VERSION_FILE)); + Collections.singletonList(url), oboFolder.resolve(EtlCommons.HPO_VERSION_FILE)); url = configuration.getDownload().getGoObo().getHost(); + logger.info("Downloading {} ...", url); downloadFiles.add(downloadFile(url, oboFolder.resolve("go-basic.obo").toString())); saveVersionData(EtlCommons.OBO_DATA, "GO", getTimeStamp(), getTimeStamp(), - Collections.singletonList(url), buildFolder.resolve(EtlCommons.GO_VERSION_FILE)); + Collections.singletonList(url), oboFolder.resolve(EtlCommons.GO_VERSION_FILE)); url = configuration.getDownload().getDoidObo().getHost(); + logger.info("Downloading {} ...", url); downloadFiles.add(downloadFile(url, oboFolder.resolve("doid.obo").toString())); saveVersionData(EtlCommons.OBO_DATA, "DO", getTimeStamp(), getTimeStamp(), - Collections.singletonList(url), buildFolder.resolve(EtlCommons.DO_VERSION_FILE)); + Collections.singletonList(url), oboFolder.resolve(EtlCommons.DO_VERSION_FILE)); url = configuration.getDownload().getMondoObo().getHost(); + logger.info("Downloading {} ...", url); downloadFiles.add(downloadFile(url, oboFolder.resolve("mondo.obo").toString())); saveVersionData(EtlCommons.OBO_DATA, "MONDO", getTimeStamp(), getTimeStamp(), - Collections.singletonList(url), buildFolder.resolve(EtlCommons.DO_VERSION_FILE)); + Collections.singletonList(url), oboFolder.resolve(EtlCommons.MONDO_VERSION_FILE)); return downloadFiles; } From a8a9328f16bd29fff27d569be6857cf97e9effd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 7 Mar 2024 16:35:13 +0100 Subject: [PATCH 006/148] lib: take into account PubMed version from 
config file, and fix sonnar issues, #TASK-5775, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 1 + .../lib/download/PubMedDownloadManager.java | 18 ++++++++++-------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 10c45ae64e..b06364b6e5 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -92,6 +92,7 @@ public class EtlCommons { public static final String HGMD_DATA = "hgmd"; public static final String PUBMED_DATA = "pubmed"; + public static final String PUBMED_VERSION_FILE = PUBMED_DATA + "Version.json"; // Load specific data options public static final String PROTEIN_FUNCTIONAL_PREDICTION_DATA = "protein_functional_prediction"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java index b5edf0220b..e913539d5b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java @@ -27,9 +27,11 @@ import java.util.Collections; import java.util.List; +import static org.opencb.cellbase.lib.EtlCommons.PUBMED_VERSION_FILE; + public class PubMedDownloadManager extends AbstractDownloadManager { - private static final String PUBMED_NAME = "PUBMED"; + private static final String PUBMED_NAME = "PubMed"; public PubMedDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) throws IOException, CellBaseException { super(species, assembly, targetDirectory, configuration); @@ -39,7 +41,7 @@ public PubMedDownloadManager(String species, String assembly, Path targetDirecto public List download() throws 
IOException, InterruptedException { logger.info("Downloading PubMed XML files..."); - Path pubmedFolder = downloadFolder.resolve("pubmed"); + Path pubmedFolder = downloadFolder.resolve(EtlCommons.PUBMED_DATA); Files.createDirectories(pubmedFolder); // Downloads PubMed XML files @@ -47,17 +49,17 @@ public List download() throws IOException, InterruptedException { String regexp = configuration.getDownload().getPubmed().getFiles().get(0); String[] name = regexp.split("[\\[\\]]"); String[] split = name[1].split("\\.\\."); - int start = Integer.valueOf(split[0]); - int end = Integer.valueOf(split[1]); - int padding = Integer.valueOf(split[2]); + int start = Integer.parseInt(split[0]); + int end = Integer.parseInt(split[1]); + int padding = Integer.parseInt(split[2]); - saveVersionData(EtlCommons.PUBMED_DATA, PUBMED_NAME, null, getTimeStamp(), Collections.singletonList(url), - pubmedFolder.resolve("pubmedVersion.json")); + saveVersionData(EtlCommons.PUBMED_DATA, PUBMED_NAME, configuration.getDownload().getPubmed().getVersion(), getTimeStamp(), + Collections.singletonList(url), pubmedFolder.resolve(PUBMED_VERSION_FILE)); List list = new ArrayList<>(); for (int i = start; i <= end; i++) { String filename = name[0] + String.format("%0" + padding + "d", i) + name[2]; - logger.info("\tDownloading file " + filename); + logger.info("\tDownloading file {}", filename); list.add(downloadFile(url + "/" + filename, pubmedFolder.resolve(filename).toString())); } return list; From f84734e9c0d238e16e13e0c576262c593ee56a6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 7 Mar 2024 17:25:30 +0100 Subject: [PATCH 007/148] lib: improve clinvar and gwas downloader by removing hardcode filenames and taking into account the version from config file, and fix sonnar issues, #TASK-5775, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 9 + .../lib/download/ClinicalDownloadManager.java | 180 +++++++++++------- 2 files changed, 117 
insertions(+), 72 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index b06364b6e5..4d516ec273 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -54,14 +54,23 @@ public class EtlCommons { public static final String PHARMGKB_VERSION_FILENAME = "pharmgkbVersion.json"; public static final String CLINICAL_VARIANTS_FOLDER = "clinicalVariant"; + @Deprecated public static final String CLINVAR_VERSION = "2022.11"; + @Deprecated public static final String CLINVAR_DATE = "2022-11"; + @Deprecated public static final String CLINVAR_XML_FILE = "ClinVarFullRelease_2022-11.xml.gz"; + @Deprecated public static final String CLINVAR_EFO_FILE = "ClinVar_Traits_EFO_Names.csv"; + @Deprecated public static final String CLINVAR_SUMMARY_FILE = "variant_summary.txt.gz"; + @Deprecated public static final String CLINVAR_VARIATION_ALLELE_FILE = "variation_allele.txt.gz"; + public static final String CLINVAR_VERSION_FILENAME = "clinvarVersion.json"; public static final String IARCTP53_FILE = "IARC-TP53.zip"; + @Deprecated public static final String GWAS_FILE = "gwas-catalog-associations_ontology-annotated.tsv"; + public static final String GWAS_VERSION_FILENAME = "gwasVersion.json"; public static final String COSMIC_FILE = "CosmicMutantExport.tsv.gz"; public static final String DBSNP_FILE = "All.vcf.gz"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java index 580a855a19..1918f82be6 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java @@ -29,15 +29,23 @@ import java.net.URI; import 
java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; +import static org.opencb.cellbase.lib.EtlCommons.CLINVAR_VERSION_FILENAME; +import static org.opencb.cellbase.lib.EtlCommons.GWAS_VERSION_FILENAME; + public class ClinicalDownloadManager extends AbstractDownloadManager { private static final String CLINVAR_NAME = "ClinVar"; private static final String GWAS_NAME = "GWAS catalog"; + /** + * @deprecated + */ + @Deprecated private static final String IARCTP53_NAME = "IARC TP53 Database"; @@ -63,39 +71,50 @@ public List downloadClinical() throws IOException, InterruptedExce logger.info("Downloading clinical information ..."); String url; + String filename; List downloadFiles = new ArrayList<>(); - Path clinicalFolder = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANTS_FOLDER); + Path clinicalFolder = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANTS_FOLDER).toAbsolutePath(); Files.createDirectories(clinicalFolder); logger.info("\t\tDownloading ClinVar files ..."); List clinvarUrls = new ArrayList<>(3); url = configuration.getDownload().getClinvar().getHost(); - - downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.CLINVAR_XML_FILE).toString())); + filename = Paths.get(url).getFileName().toString(); + logger.info("\t\tDownloading {} to {} ...", url, clinicalFolder.resolve(filename)); + downloadFiles.add(downloadFile(url, clinicalFolder.resolve(filename).toString())); clinvarUrls.add(url); url = configuration.getDownload().getClinvarEfoTerms().getHost(); - downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.CLINVAR_EFO_FILE).toString())); + filename = Paths.get(url).getFileName().toString(); + logger.info("\t\tDownloading {} to {} ...", url, clinicalFolder.resolve(filename)); + downloadFiles.add(downloadFile(url, clinicalFolder.resolve(filename).toString())); clinvarUrls.add(url); url = 
configuration.getDownload().getClinvarSummary().getHost(); - downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.CLINVAR_SUMMARY_FILE).toString())); + filename = Paths.get(url).getFileName().toString(); + logger.info("\t\tDownloading {} to {} ...", url, clinicalFolder.resolve(filename)); + downloadFiles.add(downloadFile(url, clinicalFolder.resolve(filename).toString())); clinvarUrls.add(url); url = configuration.getDownload().getClinvarVariationAllele().getHost(); - downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.CLINVAR_VARIATION_ALLELE_FILE).toString())); + filename = Paths.get(url).getFileName().toString(); + logger.info("\t\tDownloading {} to {} ...", url, clinicalFolder.resolve(filename)); + downloadFiles.add(downloadFile(url, clinicalFolder.resolve(filename).toString())); clinvarUrls.add(url); - saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, CLINVAR_NAME, getClinVarVersion(), getTimeStamp(), clinvarUrls, - clinicalFolder.resolve("clinvarVersion.json")); + + saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, CLINVAR_NAME, configuration.getDownload().getClinvar().getVersion(), + getTimeStamp(), clinvarUrls, clinicalFolder.resolve(CLINVAR_VERSION_FILENAME)); // Gwas catalog logger.info("\t\tDownloading GWAS catalog file ..."); DownloadProperties.URLProperties gwasCatalog = configuration.getDownload().getGwasCatalog(); url = gwasCatalog.getHost(); - downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.GWAS_FILE).toString())); + filename = Paths.get(url).getFileName().toString(); + logger.info("\t\tDownloading {} to {} ...", url, clinicalFolder.resolve(filename)); + downloadFiles.add(downloadFile(url, clinicalFolder.resolve(filename).toString())); saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, GWAS_NAME, gwasCatalog.getVersion(), getTimeStamp(), - Collections.singletonList(url), clinicalFolder.resolve("gwasVersion.json")); + Collections.singletonList(url), 
clinicalFolder.resolve(GWAS_VERSION_FILENAME)); // List hgvsList = getDocmHgvsList(); // if (!hgvsList.isEmpty()) { @@ -139,87 +158,110 @@ public List downloadClinical() throws IOException, InterruptedExce // Collections.singletonList(url), clinicalFolder.resolve("iarctp53Version.json")); // } - if (Files.notExists(clinicalFolder.resolve("clinvar_chunks"))) { - Files.createDirectories(clinicalFolder.resolve("clinvar_chunks")); - splitClinvar(clinicalFolder.resolve(EtlCommons.CLINVAR_XML_FILE), clinicalFolder.resolve("clinvar_chunks")); + final String chunkDir = "clinvar_chunks"; + if (Files.notExists(clinicalFolder.resolve(chunkDir))) { + Files.createDirectories(clinicalFolder.resolve(chunkDir)); + filename = Paths.get(configuration.getDownload().getClinvar().getHost()).getFileName().toString(); + logger.info("\t\tSplitting {} int {} ...", clinicalFolder.resolve(filename), clinicalFolder.resolve(chunkDir)); + splitClinvar(clinicalFolder.resolve(filename), clinicalFolder.resolve(chunkDir)); } return downloadFiles; } - return null; + return Collections.emptyList(); } private void splitClinvar(Path clinvarXmlFilePath, Path splitOutdirPath) throws IOException { - BufferedReader br = FileUtils.newBufferedReader(clinvarXmlFilePath); - PrintWriter pw = null; - StringBuilder header = new StringBuilder(); - boolean beforeEntry = true; - boolean inEntry = false; - int count = 0; - int chunk = 0; - String line; - while ((line = br.readLine()) != null) { - if (line.trim().startsWith("")) { - inEntry = false; - if (count % 10000 == 0) { - pw.print(""); - pw.close(); - chunk++; + if (line.trim().startsWith("")) { + inEntry = false; + if (count % 10000 == 0) { + pw.print(""); + pw.close(); + chunk++; + } } } + pw.print(""); + pw.close(); } - pw.print(""); - pw.close(); - br.close(); } + /** + * @deprecated + * @param docmIndexHtml + * @return + */ + @Deprecated private String getDocmVersion(Path docmIndexHtml) { return getVersionFromVersionLine(docmIndexHtml, " hgvsList, Path 
path) throws IOException, InterruptedException { - try (BufferedWriter bufferedWriter = Files.newBufferedWriter(path)) { - Client client = ClientBuilder.newClient(); - WebTarget restUrlBase = client - .target(URI.create(configuration.getDownload().getDocm().getHost() + "v1/variants")); - - logger.info("Querying DOCM REST API to get detailed data for all their variants"); - int counter = 0; - for (String hgvs : hgvsList) { - WebTarget callUrl = restUrlBase.path(hgvs + ".json"); - String jsonString = callUrl.request().get(String.class); - bufferedWriter.write(jsonString + "\n"); - - if (counter % 10 == 0) { - logger.info("{} DOCM variants saved", counter); - } - // Wait 1/3 of a second to avoid saturating their REST server - also avoid getting banned - Thread.sleep(300); - - counter++; - } - logger.info("Finished. {} DOCM variants saved at {}", counter, path); - } - } - - /** - * @deprecated - * @return - * @throws IOException - */ - @Deprecated - private List getDocmHgvsList() throws IOException { - Client client = ClientBuilder.newClient(); - WebTarget restUrl = client - .target(URI.create(configuration.getDownload().getDocm().getHost() + "v1/variants.json")); - - String jsonString; - logger.info("Getting full list of DOCM hgvs from: {}", restUrl.getUri().toURL()); - jsonString = restUrl.request().get(String.class); - - List> responseMap = parseResult(jsonString); - List hgvsList = new ArrayList<>(responseMap.size()); - for (Map document : responseMap) { - if (document.containsKey("reference_version") - && document.get("reference_version").equalsIgnoreCase(assemblyConfiguration.getName())) { - hgvsList.add(document.get("hgvs")); - } - } - logger.info("{} hgvs found", hgvsList.size()); - - return hgvsList; - } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CoreDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CoreDownloadManager.java deleted file mode 100644 index aca27ff2e8..0000000000 --- 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CoreDownloadManager.java +++ /dev/null @@ -1,467 +0,0 @@ -/* - * Copyright 2015-2020 OpenCB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.opencb.cellbase.lib.download; - -import org.apache.commons.io.FilenameUtils; -import org.apache.commons.lang.StringUtils; -import org.opencb.cellbase.core.config.CellBaseConfiguration; -import org.opencb.cellbase.core.config.SpeciesConfiguration; -import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.lib.EtlCommons; -import org.opencb.commons.utils.FileUtils; - -import java.io.*; -import java.net.URI; -import java.nio.charset.Charset; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.*; - -@Deprecated -public class CoreDownloadManager extends DownloadManager { - - private static final String ENSEMBL_NAME = "ENSEMBL"; - private static final String UNIPROT_NAME = "UniProt"; - private static final String INTACT_NAME = "IntAct"; - private static final String INTERPRO_NAME = "InterPro"; - private static final String GERP_NAME = "GERP++"; - private static final String PHASTCONS_NAME = "PhastCons"; - private static final String PHYLOP_NAME = "PhyloP"; - private static final String GENE_EXPRESSION_ATLAS_NAME = "Gene Expression Atlas"; - private static final String HPO_NAME = "HPO"; - private static final String DISGENET_NAME = "DisGeNET"; - private static 
final String GO_ANNOTATION_NAME = "EBI Gene Ontology Annotation"; - private static final String DGIDB_NAME = "DGIdb"; - private static final String GNOMAD_NAME = "gnomAD"; - - private static final HashMap GENE_UNIPROT_XREF_FILES = new HashMap() { - { - put("Homo sapiens", "HUMAN_9606_idmapping_selected.tab.gz"); - put("Mus musculus", "MOUSE_10090_idmapping_selected.tab.gz"); - put("Rattus norvegicus", "RAT_10116_idmapping_selected.tab.gz"); - put("Danio rerio", "DANRE_7955_idmapping_selected.tab.gz"); - put("Drosophila melanogaster", "DROME_7227_idmapping_selected.tab.gz"); - put("Saccharomyces cerevisiae", "YEAST_559292_idmapping_selected.tab.gz"); - } - }; - - public CoreDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) - throws IOException, CellBaseException { - super(species, assembly, targetDirectory, configuration); - } - - public CoreDownloadManager(CellBaseConfiguration configuration, Path targetDirectory, SpeciesConfiguration speciesConfiguration, - SpeciesConfiguration.Assembly assembly) throws IOException, CellBaseException { - super(configuration, targetDirectory, speciesConfiguration, assembly); - } - - public void downloadReferenceGenome() throws IOException, InterruptedException { - logger.info("Downloading genome information ..."); - Path sequenceFolder = downloadFolder.resolve("genome"); - Files.createDirectories(sequenceFolder); - - // Reference genome sequences are downloaded from Ensembl - // New Homo sapiens assemblies contain too many ALT regions, so we download 'primary_assembly' file instead - String url = ensemblHostUrl + "/" + ensemblRelease; - if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { - url = url + "/fasta/" + speciesShortName + "/dna/*.dna.primary_assembly.fa.gz"; - } else { - if (!configuration.getSpecies().getVertebrates().contains(speciesConfiguration)) { - url = ensemblHostUrl + "/" + ensemblRelease + "/" + getPhylo(speciesConfiguration); - } - url 
= url + "/fasta/"; - if (configuration.getSpecies().getBacteria().contains(speciesConfiguration)) { - // WARN: assuming there's just one assembly - url = url + speciesConfiguration.getAssemblies().get(0).getEnsemblCollection() + "/"; - } - url = url + speciesShortName + "/dna/*.dna.toplevel.fa.gz"; - } - - String outputFileName = StringUtils.capitalize(speciesShortName) + "." + assemblyConfiguration.getName() + ".fa.gz"; - Path outputPath = sequenceFolder.resolve(outputFileName); - downloadFile(url, outputPath.toString()); - logger.info("Saving reference genome version data at {}", sequenceFolder.resolve("genomeVersion.json")); - saveVersionData(EtlCommons.GENOME_DATA, ENSEMBL_NAME, ensemblVersion, getTimeStamp(), - Collections.singletonList(url), buildFolder.resolve("genomeVersion.json")); - } - - public void downloadEnsemblGene()throws IOException, InterruptedException { - logger.info("Downloading gene information ..."); - Path geneFolder = downloadFolder.resolve("gene"); - Files.createDirectories(geneFolder); - - downloadEnsemblData(geneFolder); - downloadDrugData(geneFolder); - downloadGeneUniprotXref(geneFolder); - downloadGeneExpressionAtlas(geneFolder); - downloadGeneDiseaseAnnotation(geneFolder); - downloadGnomadConstraints(geneFolder); - downloadGO(geneFolder); - // FIXME -// runGeneExtraInfo(geneFolder); - } - - private void downloadGO(Path geneFolder) throws IOException, InterruptedException { - if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { - logger.info("Downloading go annotation..."); - String url = configuration.getDownload().getGoAnnotation().getHost(); - downloadFile(url, geneFolder.resolve("goa_human.gaf.gz").toString()); - saveVersionData(EtlCommons.GENE_DATA, GO_ANNOTATION_NAME, null, getTimeStamp(), Collections.singletonList(url), - buildFolder.resolve("goAnnotationVersion.json")); - } - } - - public void downloadObo() throws IOException, InterruptedException { - logger.info("Downloading obo files ..."); - - Path 
oboFolder = downloadFolder.resolve("obo"); - Files.createDirectories(oboFolder); - - String url = configuration.getDownload().getHpoObo().getHost(); - downloadFile(url, oboFolder.resolve("hp.obo").toString()); - - url = configuration.getDownload().getGoObo().getHost(); - downloadFile(url, oboFolder.resolve("go-basic.obo").toString()); - } - - private void downloadGnomadConstraints(Path geneFolder) throws IOException, InterruptedException { - if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { - logger.info("Downloading gnomAD constraints data..."); - String url = configuration.getDownload().getGnomadConstraints().getHost(); - downloadFile(url, geneFolder.resolve("gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz").toString()); - saveVersionData(EtlCommons.GENE_DATA, GNOMAD_NAME, configuration.getDownload(). - getGnomadConstraints().getVersion(), getTimeStamp(), - Collections.singletonList(url), buildFolder.resolve("gnomadVersion.json")); - } - } - private void downloadDrugData(Path geneFolder) throws IOException, InterruptedException { - if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { - logger.info("Downloading drug-gene data..."); - String url = configuration.getDownload().getDgidb().getHost(); - downloadFile(url, geneFolder.resolve("dgidb.tsv").toString()); - saveVersionData(EtlCommons.GENE_DATA, DGIDB_NAME, null, getTimeStamp(), Collections.singletonList(url), - buildFolder.resolve("dgidbVersion.json")); - } - } - - private void downloadEnsemblData(Path geneFolder) throws IOException, InterruptedException { - logger.info("Downloading gene Ensembl data (gtf, pep, cdna, motifs) ..."); - List downloadedUrls = new ArrayList<>(4); - - String ensemblHost = ensemblHostUrl + "/" + ensemblRelease; - if (!configuration.getSpecies().getVertebrates().contains(speciesConfiguration)) { - ensemblHost = ensemblHostUrl + "/" + ensemblRelease + "/" + getPhylo(speciesConfiguration); - } - - String bacteriaCollectionPath = ""; - if 
(configuration.getSpecies().getBacteria().contains(speciesConfiguration)) { - // WARN: assuming there's just one assembly - bacteriaCollectionPath = speciesConfiguration.getAssemblies().get(0).getEnsemblCollection() + "/"; - } - - // Ensembl leaves now several GTF files in the FTP folder, we need to build a more accurate URL - // to download the correct GTF file. - String version = ensemblRelease.split("-")[1]; - String url = ensemblHost + "/gtf/" + bacteriaCollectionPath + speciesShortName + "/*" + version + ".gtf.gz"; - String fileName = geneFolder.resolve(speciesShortName + ".gtf.gz").toString(); - downloadFile(url, fileName); - downloadedUrls.add(url); - - url = ensemblHost + "/fasta/" + bacteriaCollectionPath + speciesShortName + "/pep/*.pep.all.fa.gz"; - fileName = geneFolder.resolve(speciesShortName + ".pep.all.fa.gz").toString(); - downloadFile(url, fileName); - downloadedUrls.add(url); - - url = ensemblHost + "/fasta/" + bacteriaCollectionPath + speciesShortName + "/cdna/*.cdna.all.fa.gz"; - fileName = geneFolder.resolve(speciesShortName + ".cdna.all.fa.gz").toString(); - downloadFile(url, fileName); - downloadedUrls.add(url); - - //ftp://ftp.ensembl.org/pub/release-99/regulation/homo_sapiens/MotifFeatures/Homo_sapiens.GRCh38.motif_features.gff.gz -// url = ensemblHost + "/regulation/" + speciesShortName + "/MotifFeatures/*.motif_features.gff.gz"; -// Path outputFile = geneFolder.resolve("motif_features.gff.gz"); -// downloadFile(url, outputFile.toString()); -// downloadedUrls.add(url); - - - saveVersionData(EtlCommons.GENE_DATA, ENSEMBL_NAME, ensemblVersion, getTimeStamp(), downloadedUrls, - buildFolder.resolve("ensemblCoreVersion.json")); - } - - private void downloadGeneUniprotXref(Path geneFolder) throws IOException, InterruptedException { - logger.info("Downloading UniProt ID mapping ..."); - - if (GENE_UNIPROT_XREF_FILES.containsKey(speciesConfiguration.getScientificName())) { - String geneGtfUrl = 
configuration.getDownload().getGeneUniprotXref().getHost() + "/" - + GENE_UNIPROT_XREF_FILES.get(speciesConfiguration.getScientificName()); - downloadFile(geneGtfUrl, geneFolder.resolve("idmapping_selected.tab.gz").toString()); - downloadFile(getUniProtReleaseNotesUrl(), geneFolder.resolve("uniprotRelnotes.txt").toString()); - - saveVersionData(EtlCommons.GENE_DATA, UNIPROT_NAME, - getUniProtRelease(geneFolder.resolve("uniprotRelnotes.txt").toString()), getTimeStamp(), - Collections.singletonList(geneGtfUrl), buildFolder.resolve("uniprotXrefVersion.json")); - } - } - - private String getUniProtRelease(String relnotesFilename) { - Path path = Paths.get(relnotesFilename); - Files.exists(path); - try { - // The first line at the relnotes.txt file contains the UniProt release - BufferedReader reader = Files.newBufferedReader(path, Charset.defaultCharset()); - String release = reader.readLine().split(" ")[2]; - reader.close(); - return release; - } catch (IOException e) { - e.printStackTrace(); - } - return null; - } - - private String getUniProtReleaseNotesUrl() { - return URI.create(configuration.getDownload().getGeneUniprotXref().getHost()).resolve("../../../").toString() - + "/relnotes.txt"; - } - - private void downloadGeneExpressionAtlas(Path geneFolder) throws IOException, InterruptedException { - logger.info("Downloading gene expression atlas ..."); - - String geneGtfUrl = configuration.getDownload().getGeneExpressionAtlas().getHost(); - downloadFile(geneGtfUrl, geneFolder.resolve("allgenes_updown_in_organism_part.tab.gz").toString()); - - saveVersionData(EtlCommons.GENE_DATA, GENE_EXPRESSION_ATLAS_NAME, getGeneExpressionAtlasVersion(), getTimeStamp(), - Collections.singletonList(geneGtfUrl), buildFolder.resolve("geneExpressionAtlasVersion.json")); - - } - - private String getGeneExpressionAtlasVersion() { - return FilenameUtils.getBaseName(configuration.getDownload().getGeneExpressionAtlas().getHost()) - .split("_")[5].replace(".tab", ""); - } - - private void 
downloadGeneDiseaseAnnotation(Path geneFolder) throws IOException, InterruptedException { - logger.info("Downloading gene disease annotation ..."); - - String host = configuration.getDownload().getHpo().getHost(); - String fileName = StringUtils.substringAfterLast(host, "/"); - downloadFile(host, geneFolder.resolve(fileName).toString()); - saveVersionData(EtlCommons.GENE_DATA, HPO_NAME, null, getTimeStamp(), Collections.singletonList(host), - buildFolder.resolve("hpoVersion.json")); - - host = configuration.getDownload().getDisgenet().getHost(); - List files = configuration.getDownload().getDisgenet().getFiles(); - for (String file : files) { - String outputFile = file.equalsIgnoreCase("readme.txt") ? "disgenetReadme.txt" : file; - downloadFile(host + "/" + file, geneFolder.resolve(outputFile).toString()); - } - - saveVersionData(EtlCommons.GENE_DISEASE_ASSOCIATION_DATA, DISGENET_NAME, - getVersionFromVersionLine(geneFolder.resolve("disgenetReadme.txt"), "(version"), getTimeStamp(), - Collections.singletonList(host), buildFolder.resolve("disgenetVersion.json")); - } - - private void runGeneExtraInfo(Path geneFolder) throws IOException, InterruptedException { - logger.info("Downloading gene extra info ..."); - - String geneExtraInfoLogFile = geneFolder.resolve("gene_extra_info.log").toString(); - List args = new ArrayList<>(); - args.addAll(Arrays.asList("--species", speciesConfiguration.getScientificName(), "--assembly", assemblyConfiguration.getName(), - "--outdir", geneFolder.toAbsolutePath().toString(), - "--ensembl-libs", configuration.getDownload().getEnsembl().getLibs())); - - if (!configuration.getSpecies().getVertebrates().contains(speciesConfiguration) - && !speciesConfiguration.getScientificName().equals("Drosophila melanogaster")) { - args.add("--phylo"); - args.add("no-vertebrate"); - } - - File ensemblScriptsFolder = new File(System.getProperty("basedir") + "/bin/ensembl-scripts/"); - - // run gene_extra_info.pl - boolean geneExtraInfoDownloaded = 
EtlCommons.runCommandLineProcess(ensemblScriptsFolder, - "./gene_extra_info.pl", - args, - geneExtraInfoLogFile); - - // check output - if (geneExtraInfoDownloaded) { - logger.info("Gene extra files created OK"); - } else { - logger.error("Gene extra info for " + speciesConfiguration.getScientificName() + " cannot be downloaded"); - } - } - - /** - * This method downloads Gerp, PhastCons and PhyloP data from UCSC for Human and Mouse species. - - * @throws IOException if there is an error writing to a file - * @throws InterruptedException if there is an error downloading files - */ - public void downloadConservation() throws IOException, InterruptedException { - if (!speciesHasInfoToDownload(speciesConfiguration, "conservation")) { - return; - } - - logger.info("Downloading conservation information ..."); - Path conservationFolder = downloadFolder.resolve("conservation"); - if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { - Files.createDirectories(conservationFolder); - Files.createDirectories(conservationFolder.resolve("phastCons")); - Files.createDirectories(conservationFolder.resolve("phylop")); - Files.createDirectories(conservationFolder.resolve("gerp")); - - String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", - "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "M", }; - - if (assemblyConfiguration.getName().equalsIgnoreCase("GRCh38")) { - logger.info("Downloading GERP++ ..."); - downloadFile(configuration.getDownload().getGerp().getHost(), - conservationFolder.resolve(EtlCommons.GERP_SUBDIRECTORY + "/" + EtlCommons.GERP_FILE).toAbsolutePath().toString()); - saveVersionData(EtlCommons.CONSERVATION_DATA, GERP_NAME, null, getTimeStamp(), - Collections.singletonList(configuration.getDownload().getGerp().getHost()), - buildFolder.resolve("gerpVersion.json")); - - logger.info("Downloading phastCons and PhyloP ..."); - String url = configuration.getDownload().getConservation().getHost() + 
"/hg38"; - List phastconsUrls = new ArrayList<>(chromosomes.length); - List phyloPUrls = new ArrayList<>(chromosomes.length); - for (String chromosome : chromosomes) { - String phastConsUrl = url + "/phastCons100way/hg38.100way.phastCons/chr" + chromosome + ".phastCons100way.wigFix.gz"; - downloadFile(phastConsUrl, conservationFolder.resolve("phastCons").resolve("chr" + chromosome - + ".phastCons100way.wigFix.gz").toString()); - phastconsUrls.add(phastConsUrl); - - String phyloPUrl = url + "/phyloP100way/hg38.100way.phyloP100way/chr" + chromosome + ".phyloP100way.wigFix.gz"; - downloadFile(phyloPUrl, conservationFolder.resolve("phylop").resolve("chr" + chromosome - + ".phyloP100way.wigFix.gz").toString()); - phyloPUrls.add(phyloPUrl); - } - saveVersionData(EtlCommons.CONSERVATION_DATA, PHASTCONS_NAME, null, getTimeStamp(), phastconsUrls, - buildFolder.resolve("phastConsVersion.json")); - saveVersionData(EtlCommons.CONSERVATION_DATA, PHYLOP_NAME, null, getTimeStamp(), phyloPUrls, - buildFolder.resolve("phyloPVersion.json")); - } - } - - if (speciesConfiguration.getScientificName().equals("Mus musculus")) { - Files.createDirectories(conservationFolder); - Files.createDirectories(conservationFolder.resolve("phastCons")); - Files.createDirectories(conservationFolder.resolve("phylop")); - - String url = configuration.getDownload().getConservation().getHost() + "/mm10"; - String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", - "15", "16", "17", "18", "19", "X", "Y", "M", }; - List phastconsUrls = new ArrayList<>(chromosomes.length); - List phyloPUrls = new ArrayList<>(chromosomes.length); - for (String chromosome : chromosomes) { - String phastConsUrl = url + "/phastCons60way/mm10.60way.phastCons/chr" + chromosome + ".phastCons60way.wigFix.gz"; - downloadFile(phastConsUrl, conservationFolder.resolve("phastCons").resolve("chr" + chromosome - + ".phastCons60way.wigFix.gz").toString()); - phastconsUrls.add(phastConsUrl); - String 
phyloPUrl = url + "/phyloP60way/mm10.60way.phyloP60way/chr" + chromosome + ".phyloP60way.wigFix.gz"; - downloadFile(phyloPUrl, conservationFolder.resolve("phylop").resolve("chr" + chromosome - + ".phyloP60way.wigFix.gz").toString()); - phyloPUrls.add(phyloPUrl); - } - saveVersionData(EtlCommons.CONSERVATION_DATA, PHASTCONS_NAME, null, getTimeStamp(), phastconsUrls, - buildFolder.resolve("phastConsVersion.json")); - saveVersionData(EtlCommons.CONSERVATION_DATA, PHYLOP_NAME, null, getTimeStamp(), phyloPUrls, - buildFolder.resolve("phastConsVersion.json")); - } - } - - - /** - * This method downloads UniProt, IntAct and Interpro data from EMBL-EBI. - * - * @throws IOException if there is an error writing to a file - * @throws InterruptedException if there is an error downloading files - */ - public void downloadProtein() throws IOException, InterruptedException { - if (!speciesHasInfoToDownload(speciesConfiguration, "protein")) { - return; - } - logger.info("Downloading protein information ..."); - Path proteinFolder = downloadFolder.resolve("protein"); - Files.createDirectories(proteinFolder); - - String url = configuration.getDownload().getUniprot().getHost(); - downloadFile(url, proteinFolder.resolve("uniprot_sprot.xml.gz").toString()); - String relNotesUrl = configuration.getDownload().getUniprotRelNotes().getHost(); - downloadFile(relNotesUrl, proteinFolder.resolve("uniprotRelnotes.txt").toString()); - Files.createDirectories(proteinFolder.resolve("uniprot_chunks")); - splitUniprot(proteinFolder.resolve("uniprot_sprot.xml.gz"), proteinFolder.resolve("uniprot_chunks")); - saveVersionData(EtlCommons.PROTEIN_DATA, UNIPROT_NAME, getLine(proteinFolder.resolve("uniprotRelnotes.txt"), 1), - getTimeStamp(), Collections.singletonList(url), buildFolder.resolve("uniprotVersion.json")); - -// url = configuration.getDownload().getIntact().getHost(); -// downloadFile(url, proteinFolder.resolve("intact.txt").toString()); -// saveVersionData(EtlCommons.PROTEIN_DATA, INTACT_NAME, 
null, getTimeStamp(), Collections.singletonList(url), -// proteinFolder.resolve("intactVersion.json")); -// -// url = configuration.getDownload().getInterpro().getHost(); -// downloadFile(url, proteinFolder.resolve("protein2ipr.dat.gz").toString()); -// relNotesUrl = configuration.getDownload().getInterproRelNotes().getHost(); -// downloadFile(relNotesUrl, proteinFolder.resolve("interproRelnotes.txt").toString()); -// saveVersionData(EtlCommons.PROTEIN_DATA, INTERPRO_NAME, getLine(proteinFolder.resolve("interproRelnotes.txt"), 5), -// getTimeStamp(), Collections.singletonList(url), proteinFolder.resolve("interproVersion.json")); - } - - private void splitUniprot(Path uniprotFilePath, Path splitOutdirPath) throws IOException { - BufferedReader br = FileUtils.newBufferedReader(uniprotFilePath); - PrintWriter pw = null; - StringBuilder header = new StringBuilder(); - boolean beforeEntry = true; - boolean inEntry = false; - int count = 0; - int chunk = 0; - String line; - while ((line = br.readLine()) != null) { - if (line.trim().startsWith("")) { - inEntry = false; - if (count % 10000 == 0) { - pw.print(""); - pw.close(); - chunk++; - } - } - } - pw.print(""); - pw.close(); - br.close(); - } - -} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/DownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/DownloadManager.java deleted file mode 100644 index ab1d090294..0000000000 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/DownloadManager.java +++ /dev/null @@ -1,385 +0,0 @@ -/* - * Copyright 2015-2020 OpenCB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.opencb.cellbase.lib.download; - -import com.beust.jcommander.ParameterException; -import com.fasterxml.jackson.core.util.DefaultPrettyPrinter; -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.ObjectReader; -import com.fasterxml.jackson.databind.ObjectWriter; -import org.apache.commons.io.FileUtils; -import org.apache.commons.lang3.StringUtils; -import org.opencb.cellbase.core.config.CellBaseConfiguration; -import org.opencb.cellbase.core.config.SpeciesConfiguration; -import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.lib.EtlCommons; -import org.opencb.cellbase.core.utils.SpeciesUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.nio.charset.Charset; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.sql.Timestamp; -import java.text.SimpleDateFormat; -import java.time.LocalDateTime; -import java.util.*; - -@Deprecated -public class DownloadManager { - - - private static final String CADD_NAME = "CADD"; - private static final String DGV_NAME = "DGV"; -// private static final String GWAS_NAME = "Gwas Catalog"; -// private static final String DBSNP_NAME = "dbSNP"; -// private static final String REACTOME_NAME = "Reactome"; - - private static final String GNOMAD_NAME = "gnomAD"; - - protected String species; 
- protected String assembly; - protected Path outdir; - protected CellBaseConfiguration configuration; - - protected SpeciesConfiguration speciesConfiguration; - protected String speciesShortName; - protected String ensemblHostUrl; - protected SpeciesConfiguration.Assembly assemblyConfiguration; - protected String ensemblVersion; - protected String ensemblRelease; - protected Path downloadFolder; - protected Path buildFolder; // /_/generated-json - protected Logger logger; - - public DownloadManager(String species, String assembly, Path outdir, CellBaseConfiguration configuration) - throws IOException, CellBaseException { - this.species = species; - this.assembly = assembly; - this.outdir = outdir; - this.configuration = configuration; - - this.init(); - } - - @Deprecated - public DownloadManager(CellBaseConfiguration configuration, Path targetDirectory, SpeciesConfiguration speciesConfiguration, - SpeciesConfiguration.Assembly assembly) throws IOException { - logger = LoggerFactory.getLogger(this.getClass()); - - this.configuration = configuration; - this.speciesConfiguration = speciesConfiguration; -// assemblyName = assembly.getName(); - - // Output folder creation - speciesShortName = SpeciesUtils.getSpeciesShortname(speciesConfiguration); - // /_ - Path speciesFolder = targetDirectory.resolve(speciesShortName + "_" + assembly.getName().toLowerCase()); - // /_/download - downloadFolder = targetDirectory.resolve(speciesFolder + "/download"); - makeDir(downloadFolder); - - ensemblHostUrl = getEnsemblURL(speciesConfiguration); - ensemblVersion = assembly.getEnsemblVersion(); - ensemblRelease = "release-" + ensemblVersion.split("_")[0]; - } - - private void init() throws CellBaseException, IOException { - logger = LoggerFactory.getLogger(this.getClass()); - - // Check Species - this.speciesConfiguration = SpeciesUtils.getSpeciesConfiguration(configuration, species); - if (speciesConfiguration == null) { - throw new CellBaseException("Invalid species: '" + species + 
"'"); - } - this.speciesShortName = SpeciesUtils.getSpeciesShortname(speciesConfiguration); - this.ensemblHostUrl = getEnsemblURL(speciesConfiguration); - - // Check assembly and get Ensembl version - if (StringUtils.isEmpty(assembly)) { - this.assemblyConfiguration = SpeciesUtils.getDefaultAssembly(speciesConfiguration); - } else { - this.assemblyConfiguration = SpeciesUtils.getAssembly(speciesConfiguration, assembly); - } - if (assemblyConfiguration == null) { - throw new CellBaseException("Invalid assembly: '" + assembly + "'"); - } - this.ensemblVersion = assemblyConfiguration.getEnsemblVersion(); - this.ensemblRelease = "release-" + ensemblVersion.split("_")[0]; - - // Prepare outdir - Path speciesFolder = outdir.resolve(speciesShortName + "_" + assemblyConfiguration.getName().toLowerCase()); - downloadFolder = outdir.resolve(speciesFolder + "/download"); - Files.createDirectories(downloadFolder); - - // /_/generated_json - buildFolder = outdir.resolve(speciesFolder + "/generated_json"); - Files.createDirectories(buildFolder); - - logger.info("Processing species " + speciesConfiguration.getScientificName()); - } - - @Deprecated - public DownloadFile downloadStructuralVariants() throws IOException, InterruptedException { - if (!speciesHasInfoToDownload(speciesConfiguration, "svs")) { - return null; - } - if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { - logger.info("Downloading DGV data ..."); - - Path structuralVariantsFolder = downloadFolder.resolve(EtlCommons.STRUCTURAL_VARIANTS_FOLDER); - Files.createDirectories(structuralVariantsFolder); - String sourceFilename = (assemblyConfiguration.getName().equalsIgnoreCase("grch37") ? 
"GRCh37_hg19" : "GRCh38_hg38") - + "_variants_2016-05-15.txt"; - String url = configuration.getDownload().getDgv().getHost() + "/" + sourceFilename; - saveVersionData(EtlCommons.STRUCTURAL_VARIANTS_DATA, DGV_NAME, getDGVVersion(sourceFilename), getTimeStamp(), - Collections.singletonList(url), buildFolder.resolve(EtlCommons.DGV_VERSION_FILE)); - return downloadFile(url, structuralVariantsFolder.resolve(EtlCommons.DGV_FILE).toString()); - } - return null; - } - - private String getDGVVersion(String sourceFilename) { - return sourceFilename.split("\\.")[0].split("_")[3]; - } - - protected boolean speciesHasInfoToDownload(SpeciesConfiguration sp, String info) { - boolean hasInfo = true; - if (sp.getData() == null || !sp.getData().contains(info)) { - logger.warn("Species '{}' has no '{}' information available to download", sp.getScientificName(), info); - hasInfo = false; - } - return hasInfo; - } - - protected String getTimeStamp() { - return new SimpleDateFormat("yyyyMMdd_HHmmss").format(Calendar.getInstance().getTime()); - } - - protected void saveVersionData(String data, String source, String version, String date, List url, Path outputFilePath) - throws IOException { - Map versionDataMap = new HashMap<>(); - versionDataMap.put("data", data); - versionDataMap.put("source", source); - versionDataMap.put("version", version); - versionDataMap.put("downloadDate", date); - versionDataMap.put("uRL", url); - - ObjectMapper jsonObjectMapper = new ObjectMapper(); - jsonObjectMapper.writeValue(outputFilePath.toFile(), versionDataMap); - } - - protected String getLine(Path readmePath, int lineNumber) { - Files.exists(readmePath); - try { - BufferedReader reader = Files.newBufferedReader(readmePath, Charset.defaultCharset()); - String line = null; - for (int i = 0; i < lineNumber; i++) { - line = reader.readLine(); - } - reader.close(); - return line; - } catch (IOException e) { - e.printStackTrace(); - } - return null; - } - - protected List> parseResult(String json) throws 
IOException { - ObjectMapper jsonObjectMapper = new ObjectMapper(); - jsonObjectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - ObjectReader reader = jsonObjectMapper - .readerFor(jsonObjectMapper.getTypeFactory().constructCollectionType(List.class, Map.class)); - return reader.readValue(json); - } - - protected String getPhylo(SpeciesConfiguration sp) { - if (configuration.getSpecies().getVertebrates().contains(sp)) { - return "vertebrates"; - } else if (configuration.getSpecies().getMetazoa().contains(sp)) { - return "metazoa"; - } else if (configuration.getSpecies().getFungi().contains(sp)) { - return "fungi"; - } else if (configuration.getSpecies().getProtist().contains(sp)) { - return "protists"; - } else if (configuration.getSpecies().getPlants().contains(sp)) { - return "plants"; - } else if (configuration.getSpecies().getVirus().contains(sp)) { - return "virus"; - } else if (configuration.getSpecies().getBacteria().contains(sp)) { - return "bacteria"; - } else { - throw new ParameterException("Species " + sp.getScientificName() + " not associated to any phylo in the configuration file"); - } - } - - public DownloadFile downloadCaddScores() throws IOException, InterruptedException { - if (!speciesHasInfoToDownload(speciesConfiguration, "variation_functional_score")) { - return null; - } - if (speciesConfiguration.getScientificName().equals("Homo sapiens") && assemblyConfiguration.getName().equalsIgnoreCase("GRCh37")) { - logger.info("Downloading CADD scores information ..."); - - Path variationFunctionalScoreFolder = downloadFolder.resolve("variation_functional_score"); - Files.createDirectories(variationFunctionalScoreFolder); - - // Downloads CADD scores - String url = configuration.getDownload().getCadd().getHost(); - - saveVersionData(EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, CADD_NAME, url.split("/")[5], getTimeStamp(), - Collections.singletonList(url), buildFolder.resolve("caddVersion.json")); - return downloadFile(url, 
variationFunctionalScoreFolder.resolve("whole_genome_SNVs.tsv.gz").toString()); - } - return null; - } - - protected DownloadFile downloadFile(String url, String outputFileName) throws IOException, InterruptedException { - return downloadFile(url, outputFileName, null); - } - -// protected void downloadFiles(String host, List fileNames) throws IOException, InterruptedException { -// downloadFiles(host, fileNames, fileNames); -// } - -// protected void downloadFiles(String host, List fileNames, List ouputFileNames) -// throws IOException, InterruptedException { -// for (int i = 0; i < fileNames.size(); i++) { -// downloadFile(host + "/" + fileNames.get(i), ouputFileNames.get(i), null); -// } -// } - - protected DownloadFile downloadFile(String url, String outputFileName, List wgetAdditionalArgs) - throws IOException, InterruptedException { - DownloadFile downloadFileInfo = new DownloadFile(url, outputFileName, Timestamp.valueOf(LocalDateTime.now()).toString()); - Long startTime = System.currentTimeMillis(); - if (Paths.get(outputFileName).toFile().exists()) { - logger.warn("File '{}' is already downloaded", outputFileName); - setDownloadStatusAndMessage(outputFileName, downloadFileInfo, "File '" + outputFileName + "' is already downloaded", true); - } else { - final String outputLog = outputFileName + ".log"; - List wgetArgs = new ArrayList<>(Arrays.asList("--tries=10", url, "-O", outputFileName, "-o", outputLog)); - if (wgetAdditionalArgs != null && !wgetAdditionalArgs.isEmpty()) { - wgetArgs.addAll(wgetAdditionalArgs); - } - boolean downloaded = EtlCommons.runCommandLineProcess(null, "wget", wgetArgs, outputLog); - setDownloadStatusAndMessage(outputFileName, downloadFileInfo, outputLog, downloaded); - } - downloadFileInfo.setElapsedTime(startTime, System.currentTimeMillis()); - return downloadFileInfo; - } - - private void setDownloadStatusAndMessage(String outputFileName, DownloadFile downloadFile, String outputLog, boolean downloaded) { - if (downloaded) { - 
boolean validFileSize = validateDownloadFile(downloadFile, outputFileName, outputLog); - if (validFileSize) { - downloadFile.setStatus(DownloadFile.Status.OK); - downloadFile.setMessage("File downloaded successfully"); - } else { - downloadFile.setStatus(DownloadFile.Status.ERROR); - downloadFile.setMessage("Expected downloaded file size " + downloadFile.getExpectedFileSize() - + ", Actual file size " + downloadFile.getActualFileSize()); - } - } else { - downloadFile.setMessage("See full error message in " + outputLog); - downloadFile.setStatus(DownloadFile.Status.ERROR); - // because we use the -O flag, a file will be written, even on error. See #467 -// Files.deleteIfExists((new File(outputFileName)).toPath()); - } - } - - public void writeDownloadLogFile(List downloadFiles) throws IOException { - ObjectMapper mapper = new ObjectMapper(); - ObjectWriter writer = mapper.writer(new DefaultPrettyPrinter()); - writer.writeValue(new File(downloadFolder + "/download_log.json"), downloadFiles); - } - - private boolean validateDownloadFile(DownloadFile downloadFile, String outputFileName, String outputFileLog) { - long expectedFileSize = getExpectedFileSize(outputFileLog); - long actualFileSize = FileUtils.sizeOf(new File(outputFileName)); - downloadFile.setActualFileSize(actualFileSize); - downloadFile.setExpectedFileSize(expectedFileSize); - return expectedFileSize == actualFileSize; - } - - private int getExpectedFileSize(String outputFileLog) { - try (BufferedReader reader = new BufferedReader(new FileReader(outputFileLog))) { - String line = null; - while ((line = reader.readLine()) != null) { - // looking for: Length: 13846591 (13M) - if (line.startsWith("Length:")) { - String[] parts = line.split("\\s"); - return Integer.parseInt(parts[1]); - } - } - } catch (Exception e) { - System.err.println(e); - } - return 0; - } - - protected String getVersionFromVersionLine(Path path, String tag) { - Files.exists(path); - try { - BufferedReader reader = 
Files.newBufferedReader(path, Charset.defaultCharset()); - String line = reader.readLine(); - // There shall be a line at the README.txt containing the version. - // e.g. The files in the current directory contain the data corresponding to the latest release - // (version 4.0, April 2016). ... - while (line != null) { - // tag specifies a certain string that must be found within the line supposed to contain the version - // info - if (line.contains(tag)) { - String version = line.split("\\(")[1].split("\\)")[0]; - reader.close(); - return version; - } - line = reader.readLine(); - } - } catch (IOException e) { - e.printStackTrace(); - } - return null; - } - - @Deprecated - private void makeDir(Path folderPath) throws IOException { - if (!Files.exists(folderPath)) { - Files.createDirectories(folderPath); - } - } - - @Deprecated - private String getEnsemblURL(SpeciesConfiguration sp) { - // We need to find which is the correct Ensembl host URL. - // This can different depending on if is a vertebrate species. 
- String ensemblHostUrl; - if (configuration.getSpecies().getVertebrates().contains(sp)) { - ensemblHostUrl = configuration.getDownload().getEnsembl().getUrl().getHost(); - } else { - ensemblHostUrl = configuration.getDownload().getEnsemblGenomes().getUrl().getHost(); - } - return ensemblHostUrl; - } -} - - diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index 3f90493855..d66f149c04 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -127,7 +127,7 @@ private List downloadEnsemblData(Path geneFolder) throws IOExcepti downloadFiles.add(downloadFile(url, fileName)); downloadedUrls.add(url); - saveVersionData(EtlCommons.GENE_DATA, ENSEMBL_NAME, ensemblVersion, getTimeStamp(), downloadedUrls, + saveDataSource(EtlCommons.GENE_DATA, ENSEMBL_NAME, ensemblVersion, getTimeStamp(), downloadedUrls, geneFolder.resolve(ENSEMBL_CORE_VERSION_FILENAME)); return downloadFiles; @@ -179,7 +179,7 @@ private DownloadFile downloadRefSeqFile(String name, DownloadProperties.URLPrope String version = urlProperties.getVersion(); String filename = getUrlFilename(url); Path outputPath = refSeqFolder.resolve(filename); - saveVersionData(EtlCommons.REFSEQ_DATA, name, version, timeStamp, Collections.singletonList(url), + saveDataSource(EtlCommons.REFSEQ_DATA, name, version, timeStamp, Collections.singletonList(url), refSeqFolder.resolve(versionFilename)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); @@ -190,7 +190,7 @@ private DownloadFile downloadMane(Path geneFolder) throws IOException, Interrupt if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info("Downloading MANE Select ..."); String url = configuration.getDownload().getManeSelect().getHost(); - 
saveVersionData(EtlCommons.GENE_DATA, MANE_SELECT_NAME, configuration.getDownload().getManeSelect().getVersion(), + saveDataSource(EtlCommons.GENE_DATA, MANE_SELECT_NAME, configuration.getDownload().getManeSelect().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(MANE_SELECT_VERSION_FILENAME)); Path outputPath = geneFolder.resolve(getUrlFilename(url)); @@ -204,7 +204,7 @@ private DownloadFile downloadLrg(Path geneFolder) throws IOException, Interrupte if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info("Downloading LRG data ..."); String url = configuration.getDownload().getLrg().getHost(); - saveVersionData(EtlCommons.GENE_DATA, LRG_NAME, configuration.getDownload().getLrg().getVersion(), + saveDataSource(EtlCommons.GENE_DATA, LRG_NAME, configuration.getDownload().getLrg().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(LRG_VERSION_FILENAME)); Path outputPath = geneFolder.resolve(getUrlFilename(url)); @@ -218,7 +218,7 @@ private DownloadFile downloadHgnc(Path geneFolder) throws IOException, Interrupt if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info("Downloading HGNC data ..."); String url = configuration.getDownload().getHgnc().getHost(); - saveVersionData(GENE_DATA, HGNC_GENE_NAME, configuration.getDownload().getHgnc().getVersion(), + saveDataSource(GENE_DATA, HGNC_GENE_NAME, configuration.getDownload().getHgnc().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(HGNC_VERSION_FILENAME)); Path outputPath = geneFolder.resolve(getUrlFilename(url)); @@ -232,7 +232,7 @@ private DownloadFile downloadCancerHotspot(Path geneFolder) throws IOException, if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info("Downloading Cancer Hotspot ..."); String url = configuration.getDownload().getCancerHotspot().getHost(); - saveVersionData(EtlCommons.GENE_DATA, CANCER_HOTSPOT_NAME, 
configuration.getDownload().getHgnc().getVersion(), + saveDataSource(EtlCommons.GENE_DATA, CANCER_HOTSPOT_NAME, configuration.getDownload().getHgnc().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(CANCER_HOTSPOT_VERSION_FILENAME)); Path outputPath = geneFolder.resolve(getUrlFilename(url)); @@ -246,7 +246,7 @@ private DownloadFile downloadGO(Path geneFolder) throws IOException, Interrupted if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info("Downloading GO annotation..."); String url = configuration.getDownload().getGoAnnotation().getHost(); - saveVersionData(EtlCommons.GENE_DATA, GO_ANNOTATION_NAME, configuration.getDownload().getGoAnnotation().getVersion(), + saveDataSource(EtlCommons.GENE_DATA, GO_ANNOTATION_NAME, configuration.getDownload().getGoAnnotation().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(GO_ANNOTATION_VERSION_FILENAME)); Path outputPath = geneFolder.resolve(getUrlFilename(url)); @@ -260,7 +260,7 @@ private DownloadFile downloadGnomadConstraints(Path geneFolder) throws IOExcepti if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info("Downloading gnomAD constraints data..."); String url = configuration.getDownload().getGnomadConstraints().getHost(); - saveVersionData(EtlCommons.GENE_DATA, GNOMAD_NAME, configuration.getDownload().getGnomadConstraints().getVersion(), + saveDataSource(EtlCommons.GENE_DATA, GNOMAD_NAME, configuration.getDownload().getGnomadConstraints().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(GNOMAD_VERSION_FILENAME)); Path outputPath = geneFolder.resolve(getUrlFilename(url)); @@ -274,7 +274,7 @@ private DownloadFile downloadDrugData(Path geneFolder) throws IOException, Inter if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info("Downloading drug-gene data..."); String url = configuration.getDownload().getDgidb().getHost(); - 
saveVersionData(EtlCommons.GENE_DATA, DGIDB_NAME, configuration.getDownload().getDgidb().getVersion(), getTimeStamp(), + saveDataSource(EtlCommons.GENE_DATA, DGIDB_NAME, configuration.getDownload().getDgidb().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(DGIDB_VERSION_FILENAME)); Path outputPath = geneFolder.resolve(getUrlFilename(url)); @@ -291,7 +291,7 @@ private DownloadFile downloadGeneUniprotXref(Path geneFolder) throws IOException String filename = GENE_UNIPROT_XREF_FILES.get(speciesConfiguration.getScientificName()); String geneGtfUrl = configuration.getDownload().getGeneUniprotXref().getHost() + "/" + filename; - saveVersionData(EtlCommons.GENE_DATA, UNIPROT_NAME, + saveDataSource(EtlCommons.GENE_DATA, UNIPROT_NAME, configuration.getDownload().getGeneUniprotXref().getVersion(), getTimeStamp(), Collections.singletonList(geneGtfUrl), geneFolder.resolve(UNIPROT_XREF_VERSION_FILENAME)); @@ -306,7 +306,7 @@ private DownloadFile downloadGeneUniprotXref(Path geneFolder) throws IOException private DownloadFile downloadGeneExpressionAtlas(Path geneFolder) throws IOException, InterruptedException { logger.info("Downloading gene expression atlas ..."); String geneGtfUrl = configuration.getDownload().getGeneExpressionAtlas().getHost(); - saveVersionData(EtlCommons.GENE_DATA, GENE_EXPRESSION_ATLAS_NAME, configuration.getDownload().getGeneExpressionAtlas().getVersion(), + saveDataSource(EtlCommons.GENE_DATA, GENE_EXPRESSION_ATLAS_NAME, configuration.getDownload().getGeneExpressionAtlas().getVersion(), getTimeStamp(), Collections.singletonList(geneGtfUrl), geneFolder.resolve(GENE_EXPRESSION_ATLAS_VERSION_FILENAME)); Path outputPath = geneFolder.resolve(getUrlFilename(geneGtfUrl)); @@ -322,7 +322,7 @@ private DownloadFile downloadGeneDiseaseAnnotation(Path geneFolder) throws IOExc configuration.getDownload().getHpo().getHost(), HPO_VERSION_FILENAME, GENE_DATA, HPO_NAME); String url = configuration.getDownload().getDisgenet().getHost(); - 
saveVersionData(EtlCommons.GENE_DISEASE_ASSOCIATION_DATA, DISGENET_NAME, configuration.getDownload().getDisgenet().getVersion(), + saveDataSource(EtlCommons.GENE_DISEASE_ASSOCIATION_DATA, DISGENET_NAME, configuration.getDownload().getDisgenet().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(DISGINET_VERSION_FILENAME)); Path outputPath = geneFolder.resolve(getUrlFilename(url)); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java index e7e510fb91..70fbc6f6a1 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java @@ -17,6 +17,7 @@ package org.opencb.cellbase.lib.download; import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.lib.EtlCommons; @@ -27,6 +28,8 @@ import java.util.Collections; import java.util.List; +import static org.opencb.cellbase.lib.EtlCommons.*; + public class OntologyDownloadManager extends AbstractDownloadManager { public OntologyDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) @@ -34,37 +37,34 @@ public OntologyDownloadManager(String species, String assembly, Path targetDirec super(species, assembly, targetDirectory, configuration); } - public List download() throws IOException, InterruptedException { - logger.info("Downloading OBO files ..."); + logger.info("Downloading {} files ...", ONTOLOGY_DATA); - List downloadFiles = new ArrayList<>(); - Path oboFolder = downloadFolder.resolve("ontology"); + Path oboFolder = downloadFolder.resolve(ONTOLOGY_FOLDER_NAME); Files.createDirectories(oboFolder); - String url = 
configuration.getDownload().getHpoObo().getHost(); - logger.info("Downloading {} ...", url); - downloadFiles.add(downloadFile(url, oboFolder.resolve("hp.obo").toString())); - saveVersionData(EtlCommons.OBO_DATA, "HPO", getTimeStamp(), getTimeStamp(), - Collections.singletonList(url), oboFolder.resolve(EtlCommons.HPO_VERSION_FILE)); + DownloadFile downloadFile; + List downloadFiles = new ArrayList<>(); + + // HPO + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getHpoObo(), HPO_OBO_NAME, ONTOLOGY_DATA, + HPO_OBO_FILE_ID, HPO_OBO_VERSION_FILENAME, oboFolder); + downloadFiles.add(downloadFile); - url = configuration.getDownload().getGoObo().getHost(); - logger.info("Downloading {} ...", url); - downloadFiles.add(downloadFile(url, oboFolder.resolve("go-basic.obo").toString())); - saveVersionData(EtlCommons.OBO_DATA, "GO", getTimeStamp(), getTimeStamp(), - Collections.singletonList(url), oboFolder.resolve(EtlCommons.GO_VERSION_FILE)); + // GO + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGoObo(), GO_OBO_NAME, ONTOLOGY_DATA, + GO_OBO_FILE_ID, GO_OBO_VERSION_FILENAME, oboFolder); + downloadFiles.add(downloadFile); - url = configuration.getDownload().getDoidObo().getHost(); - logger.info("Downloading {} ...", url); - downloadFiles.add(downloadFile(url, oboFolder.resolve("doid.obo").toString())); - saveVersionData(EtlCommons.OBO_DATA, "DO", getTimeStamp(), getTimeStamp(), - Collections.singletonList(url), oboFolder.resolve(EtlCommons.DO_VERSION_FILE)); + // DOID + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDoidObo(), DOID_OBO_NAME, ONTOLOGY_DATA, + DOID_OBO_FILE_ID, DOID_OBO_VERSION_FILENAME, oboFolder); + downloadFiles.add(downloadFile); - url = configuration.getDownload().getMondoObo().getHost(); - logger.info("Downloading {} ...", url); - downloadFiles.add(downloadFile(url, oboFolder.resolve("mondo.obo").toString())); - saveVersionData(EtlCommons.OBO_DATA, "MONDO", getTimeStamp(), 
getTimeStamp(), - Collections.singletonList(url), oboFolder.resolve(EtlCommons.MONDO_VERSION_FILE)); + // Mondo + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getMondoObo(), MONDO_OBO_NAME, ONTOLOGY_DATA, + MONDO_OBO_FILE_ID, MONDO_OBO_VERSION_FILENAME, oboFolder); + downloadFiles.add(downloadFile); return downloadFiles; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java index 274f6c62a7..812dcd996a 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java @@ -30,6 +30,7 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.Map; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -49,7 +50,9 @@ public List download() throws IOException, InterruptedException { List urls = new ArrayList<>(); List downloadFiles = new ArrayList<>(); - for (String url : pharmGKB.getFiles()) { + String host = pharmGKB.getHost(); + for (Map.Entry entry : pharmGKB.getFiles().entrySet()) { + String url = host + entry.getValue(); urls.add(url); Path downloadedFileName = Paths.get(new URL(url).getPath()).getFileName(); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/DataReleaseManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/DataReleaseManager.java index d7c924afa1..3bc97b1824 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/DataReleaseManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/DataReleaseManager.java @@ -26,7 +26,7 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.models.DataRelease; -import 
org.opencb.cellbase.core.models.DataReleaseSource; +import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.result.CellBaseDataResult; import org.opencb.cellbase.lib.impl.core.CellBaseDBAdaptor; import org.opencb.cellbase.lib.impl.core.ReleaseMongoDBAdaptor; @@ -107,7 +107,7 @@ public DataRelease get(int release) throws CellBaseException { } } } - throw new CellBaseException("Data release '" + release + "' does not exist for species = " + species + ", assembly = " + assembly); + throw new CellBaseException("Data release '" + release + "' does not exist" + getSpeciesAssemblyMessage()); } public DataRelease getDefault(String cellBaseVersion) throws CellBaseException { @@ -119,8 +119,7 @@ public DataRelease getDefault(String cellBaseVersion) throws CellBaseException { } } } - throw new CellBaseException("No data release found for CellBase " + cellBaseVersion + " (species = " + species + ", assembly = " - + assembly + ")"); + throw new CellBaseException("No data release found for CellBase " + cellBaseVersion + getSpeciesAssemblyMessage()); } public DataRelease update(int release, List versions) throws CellBaseException { @@ -136,28 +135,27 @@ public DataRelease update(int release, String collection, String data, List newSources = new ArrayList<>(); + List newSources = new ArrayList<>(); // First, add new data sources Set sourceSet = new HashSet<>(); ObjectMapper jsonObjectMapper = new ObjectMapper(); - ObjectReader jsonObjectReader = jsonObjectMapper.readerFor(DataReleaseSource.class); + ObjectReader jsonObjectReader = jsonObjectMapper.readerFor(DataSource.class); for (Path dataSourcePath : dataSourcePaths) { if (dataSourcePath.toFile().exists()) { try { - DataReleaseSource dataReleaseSource = jsonObjectReader.readValue(dataSourcePath.toFile()); - newSources.add(dataReleaseSource); - sourceSet.add(dataReleaseSource.getData() + "__" + dataReleaseSource.getName()); + DataSource dataSource = jsonObjectReader.readValue(dataSourcePath.toFile()); + 
newSources.add(dataSource); + sourceSet.add(dataSource.getCategory() + "__" + dataSource.getName()); } catch (IOException e) { - logger.warn("Something wrong happened when reading data release source " + dataSourcePath + ". " - + e.getMessage()); + logger.warn("Something wrong happened when reading data release source {}: {}", dataSourcePath, e.getMessage()); } } } // Second, add previous data sources if necessary (to avoid duplicated sources) - for (DataReleaseSource source : currDataRelease.getSources()) { - String key = source.getData() + "__" + source.getName(); + for (DataSource source : currDataRelease.getSources()) { + String key = source.getCategory() + "__" + source.getName(); if (!sourceSet.contains(key)) { newSources.add(source); } @@ -173,7 +171,7 @@ public DataRelease update(int release, String collection, String data, List> tmp = new ArrayList<>(); - for (DataReleaseSource source : dataRelase.getSources()) { + for (DataSource source : dataRelase.getSources()) { Map map = new HashMap<>(); - if (StringUtils.isNotEmpty(source.getData())) { - map.put("data", source.getData()); - } if (StringUtils.isNotEmpty(source.getName())) { map.put("name", source.getName()); } + if (StringUtils.isNotEmpty(source.getCategory())) { + map.put("category", source.getCategory()); + } if (StringUtils.isNotEmpty(source.getVersion())) { map.put("version", source.getVersion()); } - if (CollectionUtils.isNotEmpty(source.getUrl())) { - map.put("url", source.getUrl()); + if (StringUtils.isNotEmpty(source.getDownloadDate())) { + map.put("downloadDate", source.getDownloadDate()); + } + if (CollectionUtils.isNotEmpty(source.getUrls())) { + map.put("urls", source.getUrls()); } - if (StringUtils.isNotEmpty(source.getDate())) { - map.put("date", source.getDate()); + if (CollectionUtils.isNotEmpty(source.getNotes())) { + map.put("notes", source.getUrls()); } tmp.add(map); } @@ -224,8 +225,7 @@ public int checkDataRelease(int inRelease) throws CellBaseException { String[] split = 
GitRepositoryState.get().getBuildVersion().split("[.-]"); String version = "v" + split[0] + "." + split[1]; outRelease = getDefault(version).getRelease(); - logger.info("Using data release 0: it means to take default data release '" + outRelease + "' for CellBase version '" - + version + "'"); + logger.warn("Using data release 0: it will take the default data release {} for CellBase version {}", outRelease, version); return outRelease; } @@ -236,8 +236,12 @@ public int checkDataRelease(int inRelease) throws CellBaseException { } } - throw new CellBaseException("Invalid data release " + outRelease + " for species = " + species + ", assembly = " + assembly - + ". Valid data releases are: " + StringUtils.join(dataReleases.stream().map(dr -> dr.getRelease()) + throw new CellBaseException("Invalid data release " + outRelease + getSpeciesAssemblyMessage() + ". Valid data releases are: " + + StringUtils.join(dataReleases.stream().map(dr -> dr.getRelease()) .collect(Collectors.toList()), ",")); } + + private String getSpeciesAssemblyMessage() { + return " (species = " + species + ", assembly = " + assembly + ")"; + } } From a3e9684143cfd18d372cc2faff001b53ee77d308 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 11 Apr 2024 11:20:10 +0200 Subject: [PATCH 017/148] lib: update CellBase downloaders according to the DownloadProperties.URLProperties changes, #TASK-5775, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 4 +- .../src/main/resources/configuration.yml | 56 +++++--- .../org/opencb/cellbase/lib/EtlCommons.java | 124 +++++++++++++----- .../lib/download/AbstractDownloadManager.java | 2 - .../lib/download/CaddDownloadManager.java | 8 +- .../lib/download/ClinicalDownloadManager.java | 8 +- .../lib/download/GenomeDownloadManager.java | 70 +++++----- .../MissenseScoresDownloadManager.java | 18 ++- .../lib/download/OntologyDownloadManager.java | 8 +- .../lib/download/PharmGKBDownloadManager.java | 6 +- 
.../lib/download/ProteinDownloadManager.java | 50 +++---- .../lib/download/PubMedDownloadManager.java | 25 ++-- .../download/RegulationDownloadManager.java | 7 +- 13 files changed, 218 insertions(+), 168 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index b1a48dc0f1..8e51ac8b23 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -192,7 +192,7 @@ public void execute() { } private CellBaseBuilder buildRepeats() { - Path repeatsFilesDir = downloadFolder.resolve(EtlCommons.REPEATS_FOLDER); + Path repeatsFilesDir = downloadFolder.resolve(EtlCommons.REPEATS_SUBDIRECTORY); copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.TRF_VERSION_FILENAME))); copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.GSD_VERSION_FILENAME))); copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.WM_VERSION_FILENAME))); @@ -349,7 +349,7 @@ private CellBaseBuilder buildConservation() { } private CellBaseBuilder buildClinicalVariants() throws CellBaseException { - Path clinicalVariantFolder = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANTS_FOLDER_NAME); + Path clinicalVariantFolder = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANTS_SUBDIRECTORY); List versionFiles = new ArrayList<>(); List versionFilenames = Arrays.asList(CLINVAR_VERSION_FILENAME, COSMIC_VERSION_FILENAME, GWAS_VERSION_FILENAME, diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 20012d44a1..5052473aa0 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -91,6 +91,8 @@ download: version: "2.0.14" 
goAnnotation: host: http://geneontology.org/gene-associations/goa_human.gaf.gz + + ## Regulation mirbase: host: https://www.mirbase.org/download/miRNA.dat version: "22.1" @@ -102,33 +104,39 @@ download: ## Protein Data uniprot: - host: https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz - version: "2024-01-24" - uniprotRelNotes: - host: https://ftp.uniprot.org/pub/databases/uniprot/relnotes.txt + host: https://ftp.uniprot.org/ version: "2024-01-24" + files: + UNIPROT: pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz interpro: - host: https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/protein2ipr.dat.gz - version: "2024-01-24" - interproRelNotes: - host: https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/release_notes.txt + host: https://ftp.ebi.ac.uk/ version: "2024-01-24" + files: + INTERPRO: pub/databases/interpro/current_release/protein2ipr.dat.gz intact: - host: https://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.txt + host: https://ftp.ebi.ac.uk/ version: "2024-02-16" + files: + INTACT: pub/databases/intact/current/psimitab/intact.txt ## Conservation Scores phastCons: ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38; and put_chromosome_here by the chromosomes: 1,2,..X,Y,M - host: https://hgdownload.cse.ucsc.edu/goldenPath/put_assembly_here/phastCons470way/put_assembly_here.470way.phastCons/chrput_chromosome_here.phastCons470way.wigFix.gz + host: https://hgdownload.cse.ucsc.edu/ version: "2022-08-30" + files: + PHASTCONS: goldenPath/put_assembly_here/phastCons470way/put_assembly_here.470way.phastCons/chrput_chromosome_here.phastCons470way.wigFix.gz phylop: ## The CellBase downloader will change put_assembly_here by the assembly, e.g. 
hg38; and put_chromosome_here by the chromosomes: 1,2,..X,Y,M - host: https://hgdownload.cse.ucsc.edu/goldenPath/put_assembly_here/phyloP470way/put_assembly_here.470way.phyloP/chrput_chromosome_here.phyloP470way.wigFix.gz + host: https://hgdownload.cse.ucsc.edu/ version: "2022-08-30" + files: + PHYLOP: goldenPath/put_assembly_here/phyloP470way/put_assembly_here.470way.phyloP/chrput_chromosome_here.phyloP470way.wigFix.gz gerp: - host: http://ftp.ensembl.org/pub/release-110/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw + host: http://ftp.ensembl.org/ version: "2023-05-17" + files: + GERP: pub/release-110/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw ## Clinical Variant clinvar: @@ -165,19 +173,27 @@ download: dgv: host: http://dgv.tcag.ca/v106/docs simpleRepeats: - ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38 - host: http://hgdownload.cse.ucsc.edu/goldenPath/put_assembly_here/database/simpleRepeat.txt.gz + host: http://hgdownload.cse.ucsc.edu/ + files: + ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38 + SIMPLE_REPEATS: goldenPath/put_assembly_here/database/simpleRepeat.txt.gz windowMasker: - ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38 - host: http://hgdownload.cse.ucsc.edu/goldenPath/put_assembly_here/database/windowmaskerSdust.txt.gz + host: http://hgdownload.cse.ucsc.edu/ + files: + ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38 + WINDOW_MASKER: goldenPath/put_assembly_here/database/windowmaskerSdust.txt.gz genomicSuperDups: - ## The CellBase downloader will change put_assembly_here by the assembly, e.g. 
hg38 - host: http://hgdownload.cse.ucsc.edu/goldenPath/put_assembly_here/database/genomicSuperDups.txt.gz + host: http://hgdownload.cse.ucsc.edu/ + files: + ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38 + GENOMIC_SUPER_DUPS: goldenPath/put_assembly_here/database/genomicSuperDups.txt.gz ## Variant Pathogenic Prediction revel: - host: https://zenodo.org/record/7072866/files/revel-v1.3_all_chromosomes.zip + host: https://zenodo.org/ version: "1.3" + files: + REVEL: record/7072866/files/revel-v1.3_all_chromosomes.zip cadd: host: https://krishna.gs.washington.edu/ version: "1.7" @@ -228,7 +244,7 @@ download: host: https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/ version: 2024 files: - - pubmed24n[1..1219..4].xml.gz + PUBMED_REGEX: pubmed24n[1..1219..4].xml.gz pharmGKB: host: https://api.pharmgkb.org/v1/download/file/data/ version: v1 diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 8c048de1b3..15c93c5101 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -39,9 +39,15 @@ public class EtlCommons { public static final String HOMO_SAPIENS_NAME= "Homo sapiens"; + public static final String GRCH38_NAME = "GRCh38"; + public static final String GRCH37_NAME = "GRCh37"; + public static final String HG38_NAME = "hg38"; + public static final String HG19_NAME = "hg19"; + public static final String SUFFIX_VERSION_FILENAME = "Version.json"; public static final String GENOME_DATA = "genome"; + public static final String GENOME_VERSION_FILENAME = "genome" + SUFFIX_VERSION_FILENAME; public static final String GENE_DATA = "gene"; public static final String ENSEMBL_CORE_VERSION_FILENAME = "ensemblCore" + SUFFIX_VERSION_FILENAME; @@ -64,23 +70,32 @@ public class EtlCommons { public static final String REFSEQ_CDNA_FASTA_VERSION_FILENAME = REFSEQ_DATA 
+ "CdnaFasta" + SUFFIX_VERSION_FILENAME; public static final String GENE_DISEASE_ASSOCIATION_DATA = "gene_disease_association"; public static final String VARIATION_DATA = "variation"; - public static final String MISSENSE_VARIATION_SCORE_DATA = "missense_variation_functional_score"; - public static final String REGULATION_DATA = "regulation"; - public static final String PROTEIN_DATA = "protein"; public static final String CLINICAL_VARIANTS_DATA = "clinical_variants"; public static final String SPLICE_SCORE_DATA = "splice_score"; + // Pharmacogenomics public static final String PHARMACOGENOMICS_DATA = "pharmacogenomics"; + public static final String PHARMACOGENOMICS_SUBDIRECTORY = "pharmacogenomics"; + // PharmGKB public static final String PHARMGKB_NAME = "PharmGKB"; public static final String PHARMGKB_DATA = "pharmgkb"; - public static final String PHARMGKB_VERSION_FILENAME = PHARMGKB_DATA + SUFFIX_VERSION_FILENAME; + public static final String PHARMGKB_SUBDIRECTORY = "pharmgkb"; + public static final String PHARMGKB_VERSION_FILENAME = "pharmgkb" + SUFFIX_VERSION_FILENAME; + + // Missense variantion functional score + public static final String MISSENSE_VARIATION_SCORE_DATA = "missense_variation_functional_score"; + // Revel + public static final String REVEL_NAME = "Revel"; + public static final String REVEL_VERSION_FILENAME = "revel" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String REVEL_FILE_ID = "REVEL"; // Clinical variants data - public static final String CLINICAL_VARIANTS_FOLDER_NAME = "clinicalVariant"; + public static final String CLINICAL_VARIANTS_SUBDIRECTORY = "clinicalVariant"; // ClinVar public static final String CLINVAR_NAME = "ClinVar"; public static final String CLINVAR_VERSION_FILENAME = "clinvar" + SUFFIX_VERSION_FILENAME; - public static final String ClINVAR_CHUNKS_FOLDER_NAME = "clinvar_chunks"; + public static final String CLINVAR_CHUNKS_SUBDIRECTORY = "clinvar_chunks"; // Must match the 
configuration file public static final String CLINVAR_FULL_RELEASE_FILE_ID = "FULL_RELEASE"; public static final String CLINVAR_SUMMARY_FILE_ID = "SUMMARY"; @@ -104,10 +119,25 @@ public class EtlCommons { public static final String STRUCTURAL_VARIANTS_DATA = "svs"; public static final String REPEATS_DATA = "repeats"; + public static final String REPEATS_SUBDIRECTORY = "genome"; + public static final String REPEATS_JSON = "repeats"; + // Simple repeats + @Deprecated + public static final String TRF_FILE = "simpleRepeat.txt.gz"; + public static final String TRF_VERSION_FILENAME = "simpleRepeat" + SUFFIX_VERSION_FILENAME; + public static final String SIMPLE_REPEATS_FILE_ID = "SIMPLE_REPEATS"; + @Deprecated + public static final String GSD_FILE = "genomicSuperDups.txt.gz"; + public static final String GSD_VERSION_FILENAME = "genomicSuperDups" + SUFFIX_VERSION_FILENAME; + public static final String GENOMIC_SUPER_DUPS_FILE_ID = "GENOMIC_SUPER_DUPS"; + @Deprecated + public static final String WM_FILE = "windowmaskerSdust.txt.gz"; + public static final String WM_VERSION_FILENAME = "windowMasker" + SUFFIX_VERSION_FILENAME; + public static final String WINDOW_MASKER_FILE_ID = "WINDOW_MASKER"; // Ontology public static final String ONTOLOGY_DATA = "ontology"; - public static final String ONTOLOGY_FOLDER_NAME = "ontology"; + public static final String ONTOLOGY_SUBDIRECTORY = "ontology"; // HPO public static final String HPO_OBO_NAME = "HPO"; public static final String HPO_OBO_VERSION_FILENAME = "hpoObo" + SUFFIX_VERSION_FILENAME; @@ -134,13 +164,26 @@ public class EtlCommons { // Variation functional score public static final String VARIATION_FUNCTIONAL_SCORE_DATA = "variation_functional_score"; - public static final String VARIATION_FUNCTIONAL_SCORE_FOLDER_NAME = "variation_functional_score"; + public static final String VARIATION_FUNCTIONAL_SCORE_SUBDIRECTORY = "variation_functional_score"; // CADD scores public static final String CADD_NAME = "CADD"; public static final 
String CADD_VERSION_FILENAME = "cadd" + SUFFIX_VERSION_FILENAME; // Must match the configuration file public static final String CADD_FILE_ID = "CADD"; + // Regulation + public static final String REGULATION_DATA = "regulation"; + public static final String REGULATION_SUBDIRECTORY = "regulation"; + // Regulatory/motif features + public static final String REGULATORY_FEATURES_FILE = "Regulatory_Build.regulatory_features.gff.gz"; + public static final String MOTIF_FEATURES_FILE = "motif_features.gff.gz"; + // miRBase + public static final String MIRBASE_NAME = "miRBase"; + public static final String MIRBASE_VERSION_FILENAME = "mirbase" + SUFFIX_VERSION_FILENAME; + // miRTarBase + public static final String MIRTARBASE_NAME = "miRTarBase"; + public static final String MIRTARBASE_VERSION_FILENAME = "mirtarbase" + SUFFIX_VERSION_FILENAME; + // Build specific data options public static final String GENOME_INFO_DATA = "genome_info"; public static final String DISGENET_DATA = "disgenet"; @@ -158,23 +201,48 @@ public class EtlCommons { // public static final String IARCTP53_SOMATIC_REFERENCES_FILE = "somaticMutationReferenceIARC TP53 Database, R20.txt"; // public static final String HGMD_DATA = "hgmd"; - public static final String PUBMED_DATA = "pubmed"; - public static final String PUBMED_VERSION_FILE = PUBMED_DATA + SUFFIX_VERSION_FILENAME; - // Load specific data options public static final String PROTEIN_FUNCTIONAL_PREDICTION_DATA = "protein_functional_prediction"; + // Protein + public static final String PROTEIN_DATA = "protein"; + public static final String PROTEIN_SUBDIRECTORY = "protein"; + // UniProt + public static final String UNIPROT_NAME = "UniProt"; + public static final String UNIPROT_CHUNKS_SUBDIRECTORY = "uniprot_chunks"; + public static final String UNIPROT_VERSION_FILENAME = "uniprot" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String UNIPROT_FILE_ID = "UNIPROT"; + // InterPro + public static final String 
INTERPRO_NAME = "InterPro"; + public static final String INTERPRO_VERSION_FILENAME = "interpro" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String INTERPRO_FILE_ID = "INTERPRO"; + // IntAct + public static final String INTACT_NAME = "IntAct"; + public static final String INTACT_VERSION_FILENAME = "intact" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String INTACT_FILE_ID = "INTACT"; + + // Conservation scores public static final String CONSERVATION_DATA = "conservation"; + public static final String CONSERVATION_SUBDIRECTORY = "conservation"; + // GERP public static final String GERP_NAME = "GERP++"; public static final String GERP_SUBDIRECTORY = "gerp"; - public static final String GERP_VERSION_FILENAME = GERP_SUBDIRECTORY + SUFFIX_VERSION_FILENAME; + public static final String GERP_VERSION_FILENAME = "gerp" + SUFFIX_VERSION_FILENAME; + public static final String GERP_FILE_ID = "GERP"; + // PHASTCONS public static final String PHASTCONS_NAME = "PhastCons"; public static final String PHASTCONS_SUBDIRECTORY = "phastCons"; - public static final String PHASTCONS_VERSION_FILENAME = PHASTCONS_SUBDIRECTORY + SUFFIX_VERSION_FILENAME; + public static final String PHASTCONS_VERSION_FILENAME = "phastCons" + SUFFIX_VERSION_FILENAME; + public static final String PHASTCONS_FILE_ID = "PHASTCONS"; + // PHYLOP public static final String PHYLOP_NAME = "PhyloP"; public static final String PHYLOP_SUBDIRECTORY = "phylop"; - public static final String PHYLOP_VERSION_FILENAME = PHYLOP_SUBDIRECTORY + SUFFIX_VERSION_FILENAME; + public static final String PHYLOP_VERSION_FILENAME = "phylop" + SUFFIX_VERSION_FILENAME; + public static final String PHYLOP_FILE_ID = "PHYLOP"; // Splice scores public static final String MMSPLICE_SUBDIRECTORY = "mmsplice"; @@ -196,19 +264,6 @@ public class EtlCommons { public static final String DGV_VERSION_FILE = "dgvVersion.json"; public static final String 
STRUCTURAL_VARIANTS_JSON = "structuralVariants"; - @Deprecated - public static final String TRF_FILE = "simpleRepeat.txt.gz"; - @Deprecated - public static final String GSD_FILE = "genomicSuperDups.txt.gz"; - @Deprecated - public static final String WM_FILE = "windowmaskerSdust.txt.gz"; - - public static final String TRF_VERSION_FILENAME = "simpleRepeat" + SUFFIX_VERSION_FILENAME; - public static final String GSD_VERSION_FILENAME = "genomicSuperDups" + SUFFIX_VERSION_FILENAME; - public static final String WM_VERSION_FILENAME = "windowMasker" + SUFFIX_VERSION_FILENAME; - public static final String REPEATS_FOLDER = "genome"; - public static final String REPEATS_JSON = "repeats"; - public static final String OBO_JSON = "ontology"; public static final String HPO_VERSION_FILE = "hpo" + SUFFIX_VERSION_FILENAME; public static final String GO_VERSION_FILE = "go" + SUFFIX_VERSION_FILENAME; @@ -216,16 +271,13 @@ public class EtlCommons { public static final String MONDO_VERSION_FILE = "mondo" + SUFFIX_VERSION_FILENAME; public static final String HGMD_FILE = "hgmd.vcf"; - public static final String PUBMED_VERSION_FILENAME = "pubmedVersion.json"; - - public static final String REGULATORY_FEATURES_FILE = "Regulatory_Build.regulatory_features.gff.gz"; - public static final String MOTIF_FEATURES_FILE = "motif_features.gff.gz"; - - public static final String MIRBASE_NAME = "miRBase"; - public static final String MIRBASE_VERSION_FILENAME = MIRBASE_NAME + SUFFIX_VERSION_FILENAME; - public static final String MIRTARBASE_NAME = "miRTarBase"; - public static final String MIRTARBASE_VERSION_FILENAME = MIRTARBASE_NAME + SUFFIX_VERSION_FILENAME; + // PubMed + public static final String PUBMED_NAME = "PubMed"; + public static final String PUBMED_DATA = "pubmed"; + public static final String PUBMED_SUBDIRECTORY = "pubmed"; + public static final String PUBMED_VERSION_FILENAME = "pubmed" + SUFFIX_VERSION_FILENAME; + public static final String PUBMED_REGEX_FILE_ID = "PUBMED"; public static 
boolean runCommandLineProcess(File workingDirectory, String binPath, List args, String logFilePath) throws IOException, InterruptedException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index dcbd93a684..946d868721 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -47,8 +47,6 @@ import java.time.LocalDateTime; import java.util.*; -import static org.opencb.cellbase.lib.EtlCommons.HPO_OBO_FILE_ID; - public abstract class AbstractDownloadManager { protected static final String DOWNLOADING_LOG_MESSAGE = "Downloading {} to {} ..."; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java index 572588b2d2..6743ed8a06 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java @@ -17,9 +17,7 @@ package org.opencb.cellbase.lib.download; import org.opencb.cellbase.core.config.CellBaseConfiguration; -import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.lib.EtlCommons; import java.io.IOException; import java.nio.file.Files; @@ -42,10 +40,10 @@ public List download() throws IOException, InterruptedException { return null; } if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading {} files ...", CADD_NAME); - - Path variationFunctionalScoreFolder = downloadFolder.resolve(VARIATION_FUNCTIONAL_SCORE_FOLDER_NAME); + Path variationFunctionalScoreFolder = 
downloadFolder.resolve(VARIATION_FUNCTIONAL_SCORE_SUBDIRECTORY); Files.createDirectories(variationFunctionalScoreFolder); + logger.info("Downloading {} files at {} ...", CADD_NAME, variationFunctionalScoreFolder); + // Download CADD and save data source DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getCadd(), CADD_NAME, diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java index b1eb9e7192..bb6f53e32d 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java @@ -51,11 +51,9 @@ public List download() throws IOException, InterruptedException { public List downloadClinical() throws IOException, InterruptedException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - - logger.info("Downloading clinical information ..."); - - Path clinicalFolder = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANTS_FOLDER_NAME).toAbsolutePath(); + Path clinicalFolder = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANTS_SUBDIRECTORY).toAbsolutePath(); Files.createDirectories(clinicalFolder); + logger.info("Downloading clinical information at {} ...", clinicalFolder); String url; List urls; @@ -103,7 +101,7 @@ public List downloadClinical() throws IOException, InterruptedExce clinicalFolder.resolve(CLINVAR_VERSION_FILENAME)); // Prepare CliVar chunk files - Path chunksPath = clinicalFolder.resolve(ClINVAR_CHUNKS_FOLDER_NAME); + Path chunksPath = clinicalFolder.resolve(CLINVAR_CHUNKS_SUBDIRECTORY); if (Files.notExists(chunksPath)) { Files.createDirectories(chunksPath); Path clinvarPath = clinicalFolder.resolve(getUrlFilename( diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index 99e22561ad..210271668f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -81,9 +81,9 @@ public List downloadReferenceGenome() throws IOException, Interrup String outputFileName = StringUtils.capitalize(speciesShortName) + "." + assemblyConfiguration.getName() + ".fa.gz"; Path outputPath = sequenceFolder.resolve(outputFileName); - logger.info("Saving reference genome version data at {}", sequenceFolder.resolve("genomeVersion.json")); - saveVersionData(EtlCommons.GENOME_DATA, ENSEMBL_NAME, ensemblVersion, getTimeStamp(), - Collections.singletonList(url), sequenceFolder.resolve("genomeVersion.json")); + logger.info("Saving reference genome version data at {}", sequenceFolder.resolve(GENOME_VERSION_FILENAME)); + saveDataSource(ENSEMBL_NAME, EtlCommons.GENOME_DATA, ensemblVersion, getTimeStamp(), + Collections.singletonList(url), sequenceFolder.resolve(GENOME_VERSION_FILENAME)); List downloadFiles = Collections.singletonList(downloadFile(url, outputPath.toString())); logger.info("Unzipping file: {}", outputFileName); EtlCommons.runCommandLineProcess(null, "gunzip", Collections.singletonList(outputPath.toString()), null); @@ -101,7 +101,7 @@ public List downloadConservation() throws IOException, Interrupted return Collections.emptyList(); } logger.info("Downloading conservation information ..."); - Path conservationFolder = downloadFolder.resolve("conservation"); + Path conservationFolder = downloadFolder.resolve(CONSERVATION_SUBDIRECTORY); List downloadFiles = new ArrayList<>(); if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { Files.createDirectories(conservationFolder); @@ -112,17 +112,18 @@ public List downloadConservation() throws IOException, Interrupted String[] chromosomes = {"1", "2", "3", "4", "5", 
"6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "M", }; - if (assemblyConfiguration.getName().equalsIgnoreCase("GRCh38")) { + if (assemblyConfiguration.getName().equalsIgnoreCase(GRCH38_NAME)) { String filename; Path outputPath; - String assembly = "hg38"; + String assembly = HG38_NAME; List phastconsUrls = new ArrayList<>(chromosomes.length); List phyloPUrls = new ArrayList<>(chromosomes.length); // Downloading PhastCons and PhyloP logger.info("Downloading {} and {}", PHASTCONS_NAME, PHYLOP_NAME); for (String chromosome : chromosomes) { // PhastCons - String phastConsUrl = configuration.getDownload().getPhastCons().getHost().replaceAll(PUT_ASSEMBLY_HERE_MARK, assembly) + String phastConsUrl = configuration.getDownload().getPhastCons().getHost() + configuration.getDownload().getPhastCons() + .getFiles().get(PHASTCONS_FILE_ID).replaceAll(PUT_ASSEMBLY_HERE_MARK, assembly) .replace(PUT_CHROMOSOME_HERE_MARK, chromosome); filename = Paths.get(phastConsUrl).getFileName().toString(); outputPath = conservationFolder.resolve(PHASTCONS_SUBDIRECTORY).resolve(filename); @@ -131,7 +132,8 @@ public List downloadConservation() throws IOException, Interrupted phastconsUrls.add(phastConsUrl); // PhyloP - String phyloPUrl = configuration.getDownload().getPhylop().getHost().replaceAll(PUT_ASSEMBLY_HERE_MARK, assembly) + String phyloPUrl = configuration.getDownload().getPhylop().getHost() + configuration.getDownload().getPhylop() + .getFiles().get(PHYLOP_FILE_ID).replaceAll(PUT_ASSEMBLY_HERE_MARK, assembly) .replace(PUT_CHROMOSOME_HERE_MARK, chromosome); filename = Paths.get(phyloPUrl).getFileName().toString(); outputPath = conservationFolder.resolve(PHYLOP_SUBDIRECTORY).resolve(filename); @@ -142,26 +144,27 @@ public List downloadConservation() throws IOException, Interrupted // Downloading Gerp logger.info("Downloading {}", GERP_NAME); - String gerpUrl = configuration.getDownload().getGerp().getHost(); + String gerpUrl = 
configuration.getDownload().getGerp().getHost() + configuration.getDownload().getGerp().getFiles() + .get(GERP_FILE_ID); filename = Paths.get(gerpUrl).getFileName().toString(); outputPath = conservationFolder.resolve(GERP_SUBDIRECTORY).resolve(filename); logger.info("Downloading from {} to {}", gerpUrl, outputPath); downloadFiles.add(downloadFile(gerpUrl, outputPath.toString())); // Save data version - saveVersionData(EtlCommons.CONSERVATION_DATA, PHASTCONS_NAME, configuration.getDownload().getPhastCons().getVersion(), + saveDataSource(PHASTCONS_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getPhastCons().getVersion(), getTimeStamp(), phastconsUrls, conservationFolder.resolve(PHASTCONS_VERSION_FILENAME)); - saveVersionData(EtlCommons.CONSERVATION_DATA, PHYLOP_NAME, configuration.getDownload().getPhylop().getVersion(), + saveDataSource(PHYLOP_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getPhylop().getVersion(), getTimeStamp(), phyloPUrls, conservationFolder.resolve(PHYLOP_VERSION_FILENAME)); - saveVersionData(EtlCommons.CONSERVATION_DATA, GERP_NAME, configuration.getDownload().getGerp().getVersion(), getTimeStamp(), + saveDataSource(GERP_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getGerp().getVersion(), getTimeStamp(), Collections.singletonList(gerpUrl), conservationFolder.resolve(GERP_VERSION_FILENAME)); } } if (speciesConfiguration.getScientificName().equals("Mus musculus")) { Files.createDirectories(conservationFolder); - Files.createDirectories(conservationFolder.resolve("phastCons")); - Files.createDirectories(conservationFolder.resolve("phylop")); + Files.createDirectories(conservationFolder.resolve(PHASTCONS_SUBDIRECTORY)); + Files.createDirectories(conservationFolder.resolve(PHYLOP_SUBDIRECTORY)); String url = configuration.getDownload().getConservation().getHost() + "/mm10"; String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", @@ -170,18 +173,18 @@ 
public List downloadConservation() throws IOException, Interrupted List phyloPUrls = new ArrayList<>(chromosomes.length); for (String chromosome : chromosomes) { String phastConsUrl = url + "/phastCons60way/mm10.60way.phastCons/chr" + chromosome + ".phastCons60way.wigFix.gz"; - downloadFiles.add(downloadFile(phastConsUrl, conservationFolder.resolve("phastCons").resolve("chr" + chromosome + downloadFiles.add(downloadFile(phastConsUrl, conservationFolder.resolve(PHASTCONS_SUBDIRECTORY).resolve("chr" + chromosome + ".phastCons60way.wigFix.gz").toString())); phastconsUrls.add(phastConsUrl); String phyloPUrl = url + "/phyloP60way/mm10.60way.phyloP60way/chr" + chromosome + ".phyloP60way.wigFix.gz"; - downloadFiles.add(downloadFile(phyloPUrl, conservationFolder.resolve("phylop").resolve("chr" + chromosome + downloadFiles.add(downloadFile(phyloPUrl, conservationFolder.resolve(PHYLOP_SUBDIRECTORY).resolve("chr" + chromosome + ".phyloP60way.wigFix.gz").toString())); phyloPUrls.add(phyloPUrl); } - saveVersionData(EtlCommons.CONSERVATION_DATA, PHASTCONS_NAME, null, getTimeStamp(), phastconsUrls, - conservationFolder.resolve("phastConsVersion.json")); - saveVersionData(EtlCommons.CONSERVATION_DATA, PHYLOP_NAME, null, getTimeStamp(), phyloPUrls, - conservationFolder.resolve("phastConsVersion.json")); + saveDataSource(PHASTCONS_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getPhastCons().getVersion(), + getTimeStamp(), phastconsUrls, conservationFolder.resolve(PHASTCONS_VERSION_FILENAME)); + saveDataSource(PHYLOP_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getPhylop().getVersion(), + getTimeStamp(), phyloPUrls, conservationFolder.resolve(PHYLOP_VERSION_FILENAME)); } return downloadFiles; } @@ -192,21 +195,22 @@ public List downloadRepeats() throws IOException, InterruptedExcep } if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info("Downloading repeats data ..."); - Path repeatsFolder =
downloadFolder.resolve(EtlCommons.REPEATS_FOLDER); + Path repeatsFolder = downloadFolder.resolve(EtlCommons.REPEATS_SUBDIRECTORY); Files.createDirectories(repeatsFolder); List downloadFiles = new ArrayList<>(); String pathParam; - if (assemblyConfiguration.getName().equalsIgnoreCase("grch38")) { - pathParam = "hg38"; + if (assemblyConfiguration.getName().equalsIgnoreCase(GRCH38_NAME)) { + pathParam = HG38_NAME; } else { - logger.error("Please provide a valid human assembly {GRCh37, GRCh38)"); + logger.error("Please provide a valid human assembly: {}, {}", GRCH37_NAME, GRCH38_NAME); throw new ParameterException("Assembly '" + assemblyConfiguration.getName() + "' is not valid. Please provide " - + "a valid human assembly {GRCh37, GRCh38)"); + + "a valid human assembly: " + GRCH37_NAME + ", " + GRCH38_NAME); } // Download tandem repeat finder - String url = configuration.getDownload().getSimpleRepeats().getHost().replace(PUT_ASSEMBLY_HERE_MARK, pathParam); - saveVersionData(EtlCommons.REPEATS_DATA, TRF_NAME, configuration.getDownload().getSimpleRepeats().getVersion(), getTimeStamp(), + String url = configuration.getDownload().getSimpleRepeats().getHost() + configuration.getDownload().getSimpleRepeats() + .getFiles().get(SIMPLE_REPEATS_FILE_ID).replace(PUT_ASSEMBLY_HERE_MARK, pathParam); + saveDataSource(TRF_NAME, EtlCommons.REPEATS_DATA, configuration.getDownload().getSimpleRepeats().getVersion(), getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.TRF_VERSION_FILENAME)); Path outputPath = repeatsFolder.resolve(getUrlFilename(url)); @@ -214,8 +218,9 @@ public List downloadRepeats() throws IOException, InterruptedExcep downloadFiles.add(downloadFile(url, outputPath.toString())); // Download genomic super duplications - url = configuration.getDownload().getGenomicSuperDups().getHost().replace(PUT_ASSEMBLY_HERE_MARK, pathParam); - saveVersionData(EtlCommons.REPEATS_DATA, GSD_NAME, 
configuration.getDownload().getGenomicSuperDups().getVersion(), + url = configuration.getDownload().getGenomicSuperDups().getHost() + configuration.getDownload().getGenomicSuperDups() + .getFiles().get(GENOMIC_SUPER_DUPS_FILE_ID).replace(PUT_ASSEMBLY_HERE_MARK, pathParam); + saveDataSource(GSD_NAME, EtlCommons.REPEATS_DATA, configuration.getDownload().getGenomicSuperDups().getVersion(), getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.GSD_VERSION_FILENAME)); outputPath = repeatsFolder.resolve(getUrlFilename(url)); @@ -223,9 +228,10 @@ public List downloadRepeats() throws IOException, InterruptedExcep downloadFiles.add(downloadFile(url, outputPath.toString())); // Download WindowMasker - if (!pathParam.equalsIgnoreCase("hg19")) { - url = configuration.getDownload().getWindowMasker().getHost().replace(PUT_ASSEMBLY_HERE_MARK, pathParam); - saveVersionData(EtlCommons.REPEATS_DATA, WM_NAME, configuration.getDownload().getWindowMasker().getVersion(), + if (!pathParam.equalsIgnoreCase(HG19_NAME)) { + url = configuration.getDownload().getWindowMasker().getHost() + configuration.getDownload().getWindowMasker().getFiles() + .get(WINDOW_MASKER_FILE_ID).replace(PUT_ASSEMBLY_HERE_MARK, pathParam); + saveDataSource(WM_NAME, EtlCommons.REPEATS_DATA, configuration.getDownload().getWindowMasker().getVersion(), getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.WM_VERSION_FILENAME)); outputPath = repeatsFolder.resolve(getUrlFilename(url)); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java index 1ae2514e49..0dba31ed78 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java @@ -26,6 +26,8 @@ import java.util.Collections; 
import java.util.List; +import static org.opencb.cellbase.lib.EtlCommons.*; + public class MissenseScoresDownloadManager extends AbstractDownloadManager { public MissenseScoresDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) @@ -39,17 +41,13 @@ public List download() throws IOException, InterruptedException { } public DownloadFile downloadRevel() throws IOException, InterruptedException { - if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { - logger.info("Downloading Revel data ..."); - - Path missensePredictionScore = downloadFolder.resolve(EtlCommons.MISSENSE_VARIATION_SCORE_DATA); - Files.createDirectories(missensePredictionScore); - - String url = configuration.getDownload().getRevel().getHost(); + if (speciesConfiguration.getScientificName().equals(EtlCommons.HOMO_SAPIENS_NAME)) { + Path missensePredictionScorePath = downloadFolder.resolve(EtlCommons.MISSENSE_VARIATION_SCORE_DATA); + Files.createDirectories(missensePredictionScorePath); + logger.info("Downloading Revel data at {} ...", missensePredictionScorePath); - saveVersionData(EtlCommons.MISSENSE_VARIATION_SCORE_DATA, "Revel", null, getTimeStamp(), - Collections.singletonList(url), missensePredictionScore.resolve("revelVersion.json")); - return downloadFile(url, missensePredictionScore.resolve("revel_grch38_all_chromosomes.csv.zip").toString()); + return downloadAndSaveDataSource(configuration.getDownload().getRevel(), REVEL_NAME, + MISSENSE_VARIATION_SCORE_DATA, REVEL_FILE_ID, REVEL_VERSION_FILENAME, missensePredictionScorePath); } return null; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java index 70fbc6f6a1..7e730a8b0a 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java @@ -17,15 +17,12 @@ package org.opencb.cellbase.lib.download; import org.opencb.cellbase.core.config.CellBaseConfiguration; -import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.lib.EtlCommons; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; -import java.util.Collections; import java.util.List; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -38,10 +35,9 @@ public OntologyDownloadManager(String species, String assembly, Path targetDirec } public List download() throws IOException, InterruptedException { - logger.info("Downloading {} files ...", ONTOLOGY_DATA); - - Path oboFolder = downloadFolder.resolve(ONTOLOGY_FOLDER_NAME); + Path oboFolder = downloadFolder.resolve(ONTOLOGY_SUBDIRECTORY); Files.createDirectories(oboFolder); + logger.info("Downloading {} files {} ...", ONTOLOGY_DATA, oboFolder); DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java index 812dcd996a..04e72d3247 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java @@ -43,10 +43,10 @@ public PharmGKBDownloadManager(String species, String assembly, Path targetDirec @Override public List download() throws IOException, InterruptedException { - logger.info("Downloading PharmGKB files..."); DownloadProperties.URLProperties pharmGKB = configuration.getDownload().getPharmGKB(); - Path pharmgkbDownloadFolder = downloadFolder.resolve(PHARMACOGENOMICS_DATA).resolve(PHARMGKB_DATA); + Path pharmgkbDownloadFolder = 
downloadFolder.resolve(PHARMACOGENOMICS_SUBDIRECTORY).resolve(PHARMGKB_SUBDIRECTORY); Files.createDirectories(pharmgkbDownloadFolder); + logger.info("Downloading {} files at {} ...", PHARMGKB_DATA, pharmgkbDownloadFolder); List urls = new ArrayList<>(); List downloadFiles = new ArrayList<>(); @@ -67,7 +67,7 @@ public List download() throws IOException, InterruptedException { } // Save versions - saveVersionData(PHARMACOGENOMICS_DATA, PHARMGKB_NAME, pharmGKB.getVersion(), getTimeStamp(), urls, + saveDataSource(PHARMGKB_NAME, PHARMACOGENOMICS_DATA, pharmGKB.getVersion(), getTimeStamp(), urls, pharmgkbDownloadFolder.resolve(PHARMGKB_VERSION_FILENAME)); return downloadFiles; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java index 5a722ed448..9ebf9aa2b2 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java @@ -18,7 +18,6 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.lib.EtlCommons; import org.opencb.commons.utils.FileUtils; import java.io.BufferedReader; @@ -27,14 +26,11 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; -import java.util.Collections; import java.util.List; -public class ProteinDownloadManager extends AbstractDownloadManager { +import static org.opencb.cellbase.lib.EtlCommons.*; - private static final String UNIPROT_NAME = "UniProt"; - private static final String INTERPRO_NAME = "InterPro"; - private static final String INTACT_NAME = "IntAct"; +public class ProteinDownloadManager extends AbstractDownloadManager { public ProteinDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) 
throws IOException, CellBaseException { @@ -49,39 +45,35 @@ public ProteinDownloadManager(String species, String assembly, Path targetDirect * @throws InterruptedException if there is an error downloading files * */ public List download() throws IOException, InterruptedException { - if (!speciesHasInfoToDownload(speciesConfiguration, "protein")) { + if (!speciesHasInfoToDownload(speciesConfiguration, PROTEIN_DATA)) { return null; } - logger.info("Downloading protein information ..."); - Path proteinFolder = downloadFolder.resolve("protein"); + Path proteinFolder = downloadFolder.resolve(PROTEIN_SUBDIRECTORY); Files.createDirectories(proteinFolder); + logger.info("Downloading protein information at {} ...", proteinFolder); + + DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); // Uniprot - String url = configuration.getDownload().getUniprot().getHost(); - downloadFiles.add(downloadFile(url, proteinFolder.resolve("uniprot_sprot.xml.gz").toString())); - Files.createDirectories(proteinFolder.resolve("uniprot_chunks")); - splitUniprot(proteinFolder.resolve("uniprot_sprot.xml.gz"), proteinFolder.resolve("uniprot_chunks")); - - String relNotesUrl = configuration.getDownload().getUniprotRelNotes().getHost(); - downloadFiles.add(downloadFile(relNotesUrl, proteinFolder.resolve("uniprotRelnotes.txt").toString())); - saveVersionData(EtlCommons.PROTEIN_DATA, UNIPROT_NAME, getLine(proteinFolder.resolve("uniprotRelnotes.txt"), 1), - getTimeStamp(), Collections.singletonList(url), proteinFolder.resolve("uniprotVersion.json")); + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getUniprot(), UNIPROT_NAME, PROTEIN_DATA, UNIPROT_FILE_ID, + UNIPROT_VERSION_FILENAME, proteinFolder); + Path chunksPath = proteinFolder.resolve(UNIPROT_CHUNKS_SUBDIRECTORY); + String uniprotFilename = getUrlFilename(configuration.getDownload().getUniprot().getFiles().get(UNIPROT_FILE_ID)); + logger.info("Split UniProt file {} into chunks at {}", uniprotFilename, chunksPath); +
Files.createDirectories(chunksPath); + splitUniprot(proteinFolder.resolve(uniprotFilename), chunksPath); + downloadFiles.add(downloadFile); // Interpro - String interproUrl = configuration.getDownload().getInterpro().getHost(); - downloadFiles.add(downloadFile(interproUrl, proteinFolder.resolve("protein2ipr.dat.gz").toString())); - - relNotesUrl = configuration.getDownload().getInterproRelNotes().getHost(); - downloadFiles.add(downloadFile(relNotesUrl, proteinFolder.resolve("interproRelnotes.txt").toString())); - saveVersionData(EtlCommons.PROTEIN_DATA, INTERPRO_NAME, getLine(proteinFolder.resolve("interproRelnotes.txt"), 5), - getTimeStamp(), Collections.singletonList(interproUrl), proteinFolder.resolve("interproVersion.json")); + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getInterpro(), INTERPRO_NAME, PROTEIN_DATA, INTERPRO_FILE_ID, + INTERPRO_VERSION_FILENAME, proteinFolder); + downloadFiles.add(downloadFile); // Intact - String intactUrl = configuration.getDownload().getIntact().getHost(); - downloadFiles.add(downloadFile(intactUrl, proteinFolder.resolve("intact.txt").toString())); - saveVersionData(EtlCommons.PROTEIN_DATA, INTACT_NAME, configuration.getDownload().getIntact().getVersion(), - getTimeStamp(), Collections.singletonList(intactUrl), proteinFolder.resolve("intactVersion.json")); + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getIntact(), INTACT_NAME, PROTEIN_DATA, INTACT_FILE_ID, + INTACT_VERSION_FILENAME, proteinFolder); + downloadFiles.add(downloadFile); return downloadFiles; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java index e913539d5b..e5a8c78f26 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java @@ -27,11 +27,8 @@ 
import java.util.Collections; import java.util.List; -import static org.opencb.cellbase.lib.EtlCommons.PUBMED_VERSION_FILE; - public class PubMedDownloadManager extends AbstractDownloadManager { - private static final String PUBMED_NAME = "PubMed"; public PubMedDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) throws IOException, CellBaseException { super(species, assembly, targetDirectory, configuration); @@ -39,29 +36,29 @@ public PubMedDownloadManager(String species, String assembly, Path targetDirecto @Override public List download() throws IOException, InterruptedException { - logger.info("Downloading PubMed XML files..."); - - Path pubmedFolder = downloadFolder.resolve(EtlCommons.PUBMED_DATA); + Path pubmedFolder = downloadFolder.resolve(EtlCommons.PUBMED_SUBDIRECTORY); Files.createDirectories(pubmedFolder); + logger.info("Downloading {} files at {} ...", EtlCommons.PUBMED_DATA, pubmedFolder); // Downloads PubMed XML files String url = configuration.getDownload().getPubmed().getHost(); - String regexp = configuration.getDownload().getPubmed().getFiles().get(0); + String regexp = configuration.getDownload().getPubmed().getFiles().get(EtlCommons.PUBMED_REGEX_FILE_ID); String[] name = regexp.split("[\\[\\]]"); String[] split = name[1].split("\\.\\."); int start = Integer.parseInt(split[0]); int end = Integer.parseInt(split[1]); int padding = Integer.parseInt(split[2]); - saveVersionData(EtlCommons.PUBMED_DATA, PUBMED_NAME, configuration.getDownload().getPubmed().getVersion(), getTimeStamp(), - Collections.singletonList(url), pubmedFolder.resolve(PUBMED_VERSION_FILE)); - - List list = new ArrayList<>(); + List downloadFiles = new ArrayList<>(); for (int i = start; i <= end; i++) { String filename = name[0] + String.format("%0" + padding + "d", i) + name[2]; - logger.info("\tDownloading file {}", filename); - list.add(downloadFile(url + "/" + filename, pubmedFolder.resolve(filename).toString())); + 
logger.info("\tDownloading from {} to {} ", url + "/" + filename, pubmedFolder.resolve(filename)); + downloadFiles.add(downloadFile(url + "/" + filename, pubmedFolder.resolve(filename).toString())); } - return list; + + saveDataSource(EtlCommons.PUBMED_NAME, EtlCommons.PUBMED_DATA, configuration.getDownload().getPubmed().getVersion(), getTimeStamp(), + Collections.singletonList(url), pubmedFolder.resolve(EtlCommons.PUBMED_VERSION_FILENAME)); + + return downloadFiles; } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java index 8b0cf01abb..546bb2dc7e 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java @@ -52,13 +52,12 @@ public RegulationDownloadManager(String species, String assembly, Path outdir, C @Override public List download() throws IOException, InterruptedException, NoSuchMethodException, FileFormatException { - if (!speciesHasInfoToDownload(speciesConfiguration, "regulation")) { + if (!speciesHasInfoToDownload(speciesConfiguration, REGULATION_DATA)) { return Collections.emptyList(); } - this.regulationFolder = downloadFolder.resolve("regulation"); + regulationFolder = downloadFolder.resolve(REGULATION_SUBDIRECTORY); Files.createDirectories(regulationFolder); - - logger.info("Downloading regulation information ..."); + logger.info("Downloading {} files at {} ...", REGULATION_DATA, regulationFolder); List downloadFiles = new ArrayList<>(); From c7ad55d8d2c113dd0d4b2c60dd237b4085a118fb Mon Sep 17 00:00:00 2001 From: imedina Date: Thu, 11 Apr 2024 11:27:59 +0100 Subject: [PATCH 018/148] Rename get file name method --- .../lib/download/AbstractDownloadManager.java | 11 +++++----- .../lib/download/ClinicalDownloadManager.java | 4 ++-- 
.../lib/download/GeneDownloadManager.java | 20 +++++++++---------- .../lib/download/GenomeDownloadManager.java | 6 +++--- .../lib/download/ProteinDownloadManager.java | 2 +- 5 files changed, 21 insertions(+), 22 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index 946d868721..c87e2a9512 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -138,11 +138,10 @@ protected boolean speciesHasInfoToDownload(SpeciesConfiguration sp, String info) } protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLProperties props, String name, String category, String fileId, - String versionFilename, Path outPath) - throws IOException, InterruptedException { + String versionFilename, Path outPath) throws IOException, InterruptedException { logger.info("Downloading {} ({}) file ...", name, category); String url = props.getHost() + props.getFiles().get(fileId); - File outFile = outPath.resolve(getUrlFilename(url)).toFile(); + File outFile = outPath.resolve(getFilenameFromUrl(url)).toFile(); logger.info(DOWNLOADING_LOG_MESSAGE, url, outFile); DownloadFile downloadFile = downloadFile(url, outPath.toString()); @@ -270,12 +269,12 @@ private boolean validateDownloadFile(DownloadFile downloadFile, String outputFil private long getExpectedFileSize(String outputFileLog) { try (BufferedReader reader = new BufferedReader(new FileReader(outputFileLog))) { - String line = null; + String line; while ((line = reader.readLine()) != null) { // looking for: Length: 13846591 (13M) if (line.startsWith("Length:")) { String[] parts = line.split("\\s"); - return Long.valueOf(parts[1]); + return Long.parseLong(parts[1]); } } } catch (Exception e) { @@ -294,7 +293,7 @@ private String 
getEnsemblURL(SpeciesConfiguration sp) { } } - protected String getUrlFilename(String url) { + protected String getFilenameFromUrl(String url) { return Paths.get(url).getFileName().toString(); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java index bb6f53e32d..a274df11a4 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java @@ -91,7 +91,7 @@ public List downloadClinical() throws IOException, InterruptedExce for (String fileId : Arrays.asList(CLINVAR_FULL_RELEASE_FILE_ID, CLINVAR_SUMMARY_FILE_ID, CLINVAR_ALLELE_FILE_ID, CLINVAR_EFO_TERMS_FILE_ID)) { url = props.getHost() + props.getFiles().get(fileId); - outPath = clinicalFolder.resolve(getUrlFilename(url)); + outPath = clinicalFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outPath); downloadFiles.add(downloadFile(url, outPath.toString())); urls.add(url); @@ -104,7 +104,7 @@ public List downloadClinical() throws IOException, InterruptedExce Path chunksPath = clinicalFolder.resolve(CLINVAR_CHUNKS_SUBDIRECTORY); if (Files.notExists(chunksPath)) { Files.createDirectories(chunksPath); - Path clinvarPath = clinicalFolder.resolve(getUrlFilename( + Path clinvarPath = clinicalFolder.resolve(getFilenameFromUrl( props.getHost() + props.getFiles().get(CLINVAR_FULL_RELEASE_FILE_ID))); logger.info("Splitting {} in {} ...", clinvarPath, chunksPath); splitClinvar(clinvarPath, chunksPath); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index d66f149c04..843bc360e3 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -177,7 +177,7 @@ private DownloadFile downloadRefSeqFile(String name, DownloadProperties.URLPrope String versionFilename, Path refSeqFolder) throws IOException, InterruptedException { String url = urlProperties.getHost(); String version = urlProperties.getVersion(); - String filename = getUrlFilename(url); + String filename = getFilenameFromUrl(url); Path outputPath = refSeqFolder.resolve(filename); saveDataSource(EtlCommons.REFSEQ_DATA, name, version, timeStamp, Collections.singletonList(url), refSeqFolder.resolve(versionFilename)); @@ -193,7 +193,7 @@ private DownloadFile downloadMane(Path geneFolder) throws IOException, Interrupt saveDataSource(EtlCommons.GENE_DATA, MANE_SELECT_NAME, configuration.getDownload().getManeSelect().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(MANE_SELECT_VERSION_FILENAME)); - Path outputPath = geneFolder.resolve(getUrlFilename(url)); + Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); return downloadFile(url, outputPath.toString()); } @@ -207,7 +207,7 @@ private DownloadFile downloadLrg(Path geneFolder) throws IOException, Interrupte saveDataSource(EtlCommons.GENE_DATA, LRG_NAME, configuration.getDownload().getLrg().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(LRG_VERSION_FILENAME)); - Path outputPath = geneFolder.resolve(getUrlFilename(url)); + Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); return downloadFile(url, outputPath.toString()); } @@ -221,7 +221,7 @@ private DownloadFile downloadHgnc(Path geneFolder) throws IOException, Interrupt saveDataSource(GENE_DATA, HGNC_GENE_NAME, configuration.getDownload().getHgnc().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(HGNC_VERSION_FILENAME)); - Path 
outputPath = geneFolder.resolve(getUrlFilename(url)); + Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); return downloadFile(url, outputPath.toString()); } @@ -235,7 +235,7 @@ private DownloadFile downloadCancerHotspot(Path geneFolder) throws IOException, saveDataSource(EtlCommons.GENE_DATA, CANCER_HOTSPOT_NAME, configuration.getDownload().getHgnc().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(CANCER_HOTSPOT_VERSION_FILENAME)); - Path outputPath = geneFolder.resolve(getUrlFilename(url)); + Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); return downloadFile(url, outputPath.toString()); } @@ -249,7 +249,7 @@ private DownloadFile downloadGO(Path geneFolder) throws IOException, Interrupted saveDataSource(EtlCommons.GENE_DATA, GO_ANNOTATION_NAME, configuration.getDownload().getGoAnnotation().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(GO_ANNOTATION_VERSION_FILENAME)); - Path outputPath = geneFolder.resolve(getUrlFilename(url)); + Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); return downloadFile(url, outputPath.toString()); } @@ -263,7 +263,7 @@ private DownloadFile downloadGnomadConstraints(Path geneFolder) throws IOExcepti saveDataSource(EtlCommons.GENE_DATA, GNOMAD_NAME, configuration.getDownload().getGnomadConstraints().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(GNOMAD_VERSION_FILENAME)); - Path outputPath = geneFolder.resolve(getUrlFilename(url)); + Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); return downloadFile(url, outputPath.toString()); } @@ -277,7 +277,7 @@ private DownloadFile downloadDrugData(Path geneFolder) throws IOException, Inter saveDataSource(EtlCommons.GENE_DATA, 
DGIDB_NAME, configuration.getDownload().getDgidb().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(DGIDB_VERSION_FILENAME)); - Path outputPath = geneFolder.resolve(getUrlFilename(url)); + Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); return downloadFile(url, outputPath.toString()); } @@ -309,7 +309,7 @@ private DownloadFile downloadGeneExpressionAtlas(Path geneFolder) throws IOExcep saveDataSource(EtlCommons.GENE_DATA, GENE_EXPRESSION_ATLAS_NAME, configuration.getDownload().getGeneExpressionAtlas().getVersion(), getTimeStamp(), Collections.singletonList(geneGtfUrl), geneFolder.resolve(GENE_EXPRESSION_ATLAS_VERSION_FILENAME)); - Path outputPath = geneFolder.resolve(getUrlFilename(geneGtfUrl)); + Path outputPath = geneFolder.resolve(getFilenameFromUrl(geneGtfUrl)); logger.info(DOWNLOADING_LOG_MESSAGE, geneGtfUrl, outputPath); return downloadFile(geneGtfUrl, outputPath.toString()); } @@ -325,7 +325,7 @@ private DownloadFile downloadGeneDiseaseAnnotation(Path geneFolder) throws IOExc saveDataSource(EtlCommons.GENE_DISEASE_ASSOCIATION_DATA, DISGENET_NAME, configuration.getDownload().getDisgenet().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(DISGINET_VERSION_FILENAME)); - Path outputPath = geneFolder.resolve(getUrlFilename(url)); + Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); return downloadFile(url, outputPath.toString()); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index 210271668f..bbd25cf8f7 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -213,7 +213,7 @@ public List 
downloadRepeats() throws IOException, InterruptedExcep saveDataSource(TRF_NAME, EtlCommons.REPEATS_DATA, configuration.getDownload().getSimpleRepeats().getVersion(), getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.TRF_VERSION_FILENAME)); - Path outputPath = repeatsFolder.resolve(getUrlFilename(url)); + Path outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); @@ -223,7 +223,7 @@ public List downloadRepeats() throws IOException, InterruptedExcep saveDataSource(GSD_NAME, EtlCommons.REPEATS_DATA, configuration.getDownload().getGenomicSuperDups().getVersion(), getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.GSD_VERSION_FILENAME)); - outputPath = repeatsFolder.resolve(getUrlFilename(url)); + outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); @@ -234,7 +234,7 @@ public List downloadRepeats() throws IOException, InterruptedExcep saveDataSource(WM_NAME, EtlCommons.REPEATS_DATA, configuration.getDownload().getWindowMasker().getVersion(), getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.WM_VERSION_FILENAME)); - outputPath = repeatsFolder.resolve(getUrlFilename(url)); + outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java index 9ebf9aa2b2..799bc92aad 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java @@ -59,7 +59,7 @@ public List download() throws IOException, InterruptedException { downloadFile = downloadAndSaveDataSource(configuration.getDownload().getUniprot(), UNIPROT_NAME, PROTEIN_DATA, UNIPROT_FILE_ID, UNIPROT_VERSION_FILENAME, proteinFolder); Path chunksPath = proteinFolder.resolve(UNIPROT_CHUNKS_SUBDIRECTORY); - String uniprotFilename = getUrlFilename(configuration.getDownload().getUniprot().getFiles().get(UNIPROT_FILE_ID)); + String uniprotFilename = getFilenameFromUrl(configuration.getDownload().getUniprot().getFiles().get(UNIPROT_FILE_ID)); logger.info("Split UniProt file {} into chunks at {}", uniprotFilename, chunksPath); Files.createDirectories(chunksPath); splitUniprot(proteinFolder.resolve(uniprotFilename), chunksPath); From e92b67680200bfdf0cb744809855e32e11112f88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 11 Apr 2024 18:02:23 +0200 Subject: [PATCH 019/148] lib: update CellBase downloaders according to the DownloadProperties.URLProperties changes, #TASK-5775, #TASK-5564 --- .../app/cli/admin/AdminCliOptionsParser.java | 8 +- .../cellbase/app/cli/admin/AdminMain.java | 1 - .../admin/executors/BuildCommandExecutor.java | 4 +- .../executors/DownloadCommandExecutor.java | 101 +-- .../executors/ExportCommandExecutor.java | 6 +- .../admin/executors/LoadCommandExecutor.java | 8 +- .../core/exception/CellBaseException.java | 3 + .../src/main/resources/configuration.yml | 14 +- .../org/opencb/cellbase/lib/EtlCommons.java | 29 +- .../lib/builders/OntologyBuilder.java | 10 +- .../builders/RegulatoryFeatureBuilder.java | 11 +- .../lib/builders/RegulatoryRegionBuilder.java | 607 ------------------ .../lib/download/GenomeDownloadManager.java | 8 - .../download/RegulationDownloadManager.java | 180 +++--- .../lib/managers/DataReleaseManager.java | 3 - 15 files changed, 207 insertions(+), 786 deletions(-) delete mode 
100644 cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryRegionBuilder.java diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java index 6049ef9b4b..55342641b3 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java @@ -88,11 +88,13 @@ public class DownloadCommandOptions { public SpeciesAndAssemblyCommandOptions speciesAndAssemblyOptions = speciesAndAssemblyCommandOptions; @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to download: genome, gene, " - + "variation, variation_functional_score, regulation, protein, conservation, " - + "clinical_variants, repeats, svs, pubmed and 'all' to download everything", required = true, arity = 1) + + "variation_functional_score, missense_variation_functional_score, regulation, protein, conservation, " + + "clinical_variants, repeats, ontology, pubmed and pharmacogenomics; or use 'all' to download everything", + required = true, arity = 1) public String data; - @Parameter(names = {"-o", "--outdir"}, description = "Downloaded files will be saved in this directory.", required = true, arity = 1) + @Parameter(names = {"-o", "--outdir"}, description = "Downloaded files will be saved in this directory.", required = true, + arity = 1) public String outputDirectory; } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java index 10c43d637c..d77722a492 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java @@ -103,5 +103,4 @@ public static void main(String[] args) { } } } - } diff --git 
a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 8e51ac8b23..71b20e8b5a 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -157,7 +157,7 @@ public void execute() { case EtlCommons.REPEATS_DATA: parser = buildRepeats(); break; - case EtlCommons.OBO_DATA: + case ONTOLOGY_DATA: parser = buildObo(); break; case EtlCommons.SPLICE_SCORE_DATA: @@ -202,7 +202,7 @@ private CellBaseBuilder buildRepeats() { } private CellBaseBuilder buildObo() { - Path oboDir = downloadFolder.resolve(EtlCommons.OBO_DATA); + Path oboDir = downloadFolder.resolve(ONTOLOGY_DATA); CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, EtlCommons.OBO_JSON); return new OntologyBuilder(oboDir, serializer); } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java index f8197e6558..f8d3e04eb9 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java @@ -16,15 +16,11 @@ package org.opencb.cellbase.app.cli.admin.executors; -import com.beust.jcommander.ParameterException; import org.apache.commons.lang3.StringUtils; import org.opencb.biodata.formats.io.FileFormatException; import org.opencb.cellbase.app.cli.CommandExecutor; import org.opencb.cellbase.app.cli.admin.AdminCliOptionsParser; -import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; -import 
org.opencb.cellbase.core.utils.SpeciesUtils; -import org.opencb.cellbase.lib.EtlCommons; import org.opencb.cellbase.lib.download.AbstractDownloadManager; import org.opencb.cellbase.lib.download.DownloadFile; import org.opencb.cellbase.lib.download.Downloader; @@ -36,6 +32,8 @@ import java.util.Arrays; import java.util.List; +import static org.opencb.cellbase.lib.EtlCommons.*; + /** * Created by imedina on 03/02/15. */ @@ -44,6 +42,10 @@ public class DownloadCommandExecutor extends CommandExecutor { private AdminCliOptionsParser.DownloadCommandOptions downloadCommandOptions; private Path outputDirectory; + private static final List VALID_SOURCES_TO_DOWNLOAD = Arrays.asList(GENOME_DATA, GENE_DATA, VARIATION_FUNCTIONAL_SCORE_DATA, + MISSENSE_VARIATION_SCORE_DATA, REGULATION_DATA, PROTEIN_DATA, CONSERVATION_DATA, CLINICAL_VARIANTS_DATA, REPEATS_DATA, + ONTOLOGY_DATA, PUBMED_DATA, PHARMACOGENOMICS_DATA); + public DownloadCommandExecutor(AdminCliOptionsParser.DownloadCommandOptions downloadCommandOptions) { super(downloadCommandOptions.commonOptions.logLevel, downloadCommandOptions.commonOptions.conf); @@ -52,88 +54,95 @@ public DownloadCommandExecutor(AdminCliOptionsParser.DownloadCommandOptions down } /** - * Execute specific 'download' command options. + * Process CellBase command 'download'. 
+ * + * @throws CellBaseException Exception */ - public void execute() { + public void execute() throws CellBaseException { try { String species = downloadCommandOptions.speciesAndAssemblyOptions.species; String assembly = downloadCommandOptions.speciesAndAssemblyOptions.assembly; List downloadFiles = new ArrayList<>(); - List dataList = getDataList(species); + List dataList = checkDataSources(); Downloader downloader = new Downloader(species, assembly, outputDirectory, configuration); for (String data : dataList) { switch (data) { - case EtlCommons.GENOME_DATA: + case GENOME_DATA: downloadFiles.addAll(downloader.downloadGenome()); break; - case EtlCommons.GENE_DATA: + case GENE_DATA: downloadFiles.addAll(downloader.downloadGene()); break; -// case EtlCommons.VARIATION_DATA: -// downloadManager.downloadVariation(); -// break; - case EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA: + case VARIATION_FUNCTIONAL_SCORE_DATA: downloadFiles.addAll(downloader.downloadCaddScores()); break; - case EtlCommons.MISSENSE_VARIATION_SCORE_DATA: + case MISSENSE_VARIATION_SCORE_DATA: downloadFiles.addAll(downloader.downloadPredictionScores()); break; - case EtlCommons.REGULATION_DATA: + case REGULATION_DATA: downloadFiles.addAll(downloader.downloadRegulation()); break; - case EtlCommons.PROTEIN_DATA: + case PROTEIN_DATA: downloadFiles.addAll(downloader.downloadProtein()); break; - case EtlCommons.CONSERVATION_DATA: + case CONSERVATION_DATA: downloadFiles.addAll(downloader.downloadConservation()); break; - case EtlCommons.CLINICAL_VARIANTS_DATA: + case CLINICAL_VARIANTS_DATA: downloadFiles.addAll(downloader.downloadClinicalVariants()); break; -// case EtlCommons.STRUCTURAL_VARIANTS_DATA: -// downloadFiles.add(downloadManager.downloadStructuralVariants()); -// break; - case EtlCommons.REPEATS_DATA: + case REPEATS_DATA: downloadFiles.addAll(downloader.downloadRepeats()); break; - case EtlCommons.OBO_DATA: + case ONTOLOGY_DATA: downloadFiles.addAll(downloader.downloadOntologies()); break; - 
case EtlCommons.PUBMED_DATA: + case PUBMED_DATA: downloadFiles.addAll(downloader.downloadPubMed()); break; - case EtlCommons.PHARMACOGENOMICS_DATA: + case PHARMACOGENOMICS_DATA: downloadFiles.addAll(downloader.downloadPharmKGB()); break; default: - System.out.println("Value \"" + data + "\" is not allowed for the data parameter. Allowed values" - + " are: {genome, gene, gene_disease_association, variation, variation_functional_score," - + " regulation, protein, conservation, clinical_variants, ontology, pubmed}"); - break; + throw new IllegalArgumentException("Value '" + data + "' is not allowed for the data parameter. Valid values are: " + + StringUtils.join(VALID_SOURCES_TO_DOWNLOAD, ",") + "; or use 'all' to download everything"); } } AbstractDownloadManager.writeDownloadLogFile(outputDirectory, downloadFiles); - } catch (ParameterException | IOException | CellBaseException | InterruptedException | NoSuchMethodException - | FileFormatException e) { - logger.error("Error in 'download' command line: " + e.getMessage()); + } catch (IOException | NoSuchMethodException | FileFormatException e) { + throw new CellBaseException("Error executing command line 'download'", e); + } catch (InterruptedException e) { + // Restore interrupted state... + Thread.currentThread().interrupt(); + throw new CellBaseException("Error executing command line 'download'", e); } } - private List getDataList(String species) throws CellBaseException { - if (StringUtils.isEmpty(downloadCommandOptions.data) || downloadCommandOptions.data.equals("all")) { - return SpeciesUtils.getSpeciesConfiguration(configuration, species).getData(); - } else { - return Arrays.asList(downloadCommandOptions.data.split(",")); + private List checkDataSources() { + if (StringUtils.isEmpty(downloadCommandOptions.data)) { + throw new IllegalArgumentException("Missing data parameter. 
Valid values are: " + + StringUtils.join(VALID_SOURCES_TO_DOWNLOAD, ",") + "; or use 'all' to download everything"); } - } - - @Deprecated - private List getDataList(SpeciesConfiguration sp) { - List dataList; - if (downloadCommandOptions.data.equals("all")) { - dataList = sp.getData(); - } else { - dataList = Arrays.asList(downloadCommandOptions.data.split(",")); + List dataList = Arrays.asList(downloadCommandOptions.data.split(",")); + for (String data : dataList) { + switch (data) { + case GENOME_DATA: + case GENE_DATA: + case VARIATION_FUNCTIONAL_SCORE_DATA: + case MISSENSE_VARIATION_SCORE_DATA: + case REGULATION_DATA: + case PROTEIN_DATA: + case CONSERVATION_DATA: + case CLINICAL_VARIANTS_DATA: + case REPEATS_DATA: + case ONTOLOGY_DATA: + case PUBMED_DATA: + case PHARMACOGENOMICS_DATA: + break; + default: + throw new IllegalArgumentException("Value '" + data + "' is not allowed for the data parameter. Valid values are: " + + StringUtils.join(VALID_SOURCES_TO_DOWNLOAD, ",") + "; or use 'all' to download everything"); + } } return dataList; } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java index 72f992f344..85446fac1f 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java @@ -86,7 +86,7 @@ public ExportCommandExecutor(AdminCliOptionsParser.ExportCommandOptions exportCo EtlCommons.CONSERVATION_DATA, EtlCommons.REGULATION_DATA, EtlCommons.PROTEIN_DATA, EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA, EtlCommons.VARIATION_DATA, EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANTS_DATA, EtlCommons.REPEATS_DATA, - OBO_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, EtlCommons.PHARMACOGENOMICS_DATA}; + 
ONTOLOGY_DATA, MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, EtlCommons.PHARMACOGENOMICS_DATA}; } else { this.dataToExport = exportCommandOptions.data.split(","); } @@ -309,7 +309,7 @@ public void execute() throws CellBaseException { counterMsg = counter + " repeats"; break; } - case OBO_DATA: { + case ONTOLOGY_DATA: { counter = exportOntologyData(); counterMsg = counter + " ontology items"; break; @@ -449,7 +449,7 @@ private int exportClinicalVariantData(List regions) throws CellBaseExcep private int exportOntologyData() throws CellBaseException, IOException { int counter = 0; - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(output, OBO_DATA); + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(output, ONTOLOGY_DATA); OntologyManager ontologyManager = managerFactory.getOntologyManager(species, assembly); CellBaseIterator iterator = ontologyManager.iterator(new OntologyQuery()); while (iterator.hasNext()) { diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index 6af05bf732..ca1a4a9a71 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -80,8 +80,8 @@ public LoadCommandExecutor(AdminCliOptionsParser.LoadCommandOptions loadCommandO EtlCommons.CONSERVATION_DATA, EtlCommons.REGULATION_DATA, EtlCommons.PROTEIN_DATA, EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA, EtlCommons.VARIATION_DATA, EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANTS_DATA, EtlCommons.REPEATS_DATA, - EtlCommons.OBO_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, EtlCommons.PUBMED_DATA, - EtlCommons.PHARMACOGENOMICS_DATA}; + EtlCommons.ONTOLOGY_DATA, 
EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, + EtlCommons.PUBMED_DATA, EtlCommons.PHARMACOGENOMICS_DATA}; } else { loadOptions = loadCommandOptions.data.split(","); } @@ -268,7 +268,7 @@ public void execute() throws CellBaseException { // case EtlCommons.STRUCTURAL_VARIANTS_DATA: // loadStructuralVariants(); // break; - case EtlCommons.OBO_DATA: { + case EtlCommons.ONTOLOGY_DATA: { // Load data loadIfExists(input.resolve("ontology.json.gz"), "ontology"); @@ -281,7 +281,7 @@ public void execute() throws CellBaseException { input.resolve(EtlCommons.GO_VERSION_FILE), input.resolve(EtlCommons.DO_VERSION_FILE) )); - dataReleaseManager.update(dataRelease, "ontology", EtlCommons.OBO_DATA, sources); + dataReleaseManager.update(dataRelease, "ontology", EtlCommons.ONTOLOGY_DATA, sources); break; } case EtlCommons.SPLICE_SCORE_DATA: { diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/exception/CellBaseException.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/exception/CellBaseException.java index 884c63f2ae..422a52b0d4 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/exception/CellBaseException.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/exception/CellBaseException.java @@ -22,5 +22,8 @@ public CellBaseException(String msg) { super(msg); } + public CellBaseException(String msg, Throwable e) { + super(msg, e); + } } diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 5052473aa0..28263dfb6e 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -53,7 +53,11 @@ download: password: '' libs: "${CELLBASE.ENSEMBL.LIBS}" url: - host: ftp://ftp.ensembl.org/pub + host: ftp://ftp.ensembl.org/pub/ + files: + REGULATORY_BUILD: "regulation/put_species_here/*Regulatory_Build.regulatory_features*.gff.gz" + MOTIF_FEATURES: 
"regulation/put_species_here/MotifFeatures/*put_assembly_here.motif_features.gff.gz" + MOTIF_FEATURES_INDEX: "regulation/put_species_here/MotifFeatures/*put_assembly_here.motif_features.gff.gz.tbi" ensemblGenomes: database: host: mysql-eg-publicsql.ebi.ac.uk:4157 @@ -94,13 +98,17 @@ download: ## Regulation mirbase: - host: https://www.mirbase.org/download/miRNA.dat + host: https://www.mirbase.org/ version: "22.1" + files: + MIRBASE: download/miRNA.dat targetScan: host: http://hgdownload.cse.ucsc.edu/goldenPath/ miRTarBase: - host: https://mirtarbase.cuhk.edu.cn/~miRTarBase/miRTarBase_2022/cache/download/9.0/hsa_MTI.xlsx + host: https://mirtarbase.cuhk.edu.cn/ version: "9.0" + files: + MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/hsa_MTI.xlsx ## Protein Data uniprot: diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 15c93c5101..207841aabb 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -35,7 +35,15 @@ */ public class EtlCommons { + // Ensembl public static final String ENSEMBL_NAME = "ENSEMBL"; + public static final String PUT_SPECIES_HERE_MARK = "put_species_here"; + public static final String PUT_ASSEMBLY_HERE_MARK = "put_assembly_here"; + public static final String PUT_CHROMOSOME_HERE_MARK = "put_chromosome_here"; + // Must match the configuration file + public static final String REGULATORY_BUILD_FILE_ID = "REGULATORY_BUILD"; + public static final String MOTIF_FEATURES_FILE_ID = "MOTIF_FEATURES"; + public static final String MOTIF_FEATURES_INDEX_FILE_ID = "MOTIF_FEATURES_INDEX"; public static final String HOMO_SAPIENS_NAME= "Homo sapiens"; @@ -122,14 +130,19 @@ public class EtlCommons { public static final String REPEATS_SUBDIRECTORY = "genome"; public static final String REPEATS_JSON = "repeats"; // Simple repeats + public static final 
String TRF_NAME = "Tandem Repeats Finder"; @Deprecated public static final String TRF_FILE = "simpleRepeat.txt.gz"; public static final String TRF_VERSION_FILENAME = "simpleRepeat" + SUFFIX_VERSION_FILENAME; public static final String SIMPLE_REPEATS_FILE_ID = "SIMPLE_REPEATS"; + // Genomic super duplications + public static final String GSD_NAME = "Genomic Super Duplications"; @Deprecated public static final String GSD_FILE = "genomicSuperDups.txt.gz"; public static final String GSD_VERSION_FILENAME = "genomicSuperDups" + SUFFIX_VERSION_FILENAME; public static final String GENOMIC_SUPER_DUPS_FILE_ID = "GENOMIC_SUPER_DUPS"; + // Window masker + public static final String WM_NAME = "Window Masker"; @Deprecated public static final String WM_FILE = "windowmaskerSdust.txt.gz"; public static final String WM_VERSION_FILENAME = "windowMasker" + SUFFIX_VERSION_FILENAME; @@ -174,15 +187,22 @@ public class EtlCommons { // Regulation public static final String REGULATION_DATA = "regulation"; public static final String REGULATION_SUBDIRECTORY = "regulation"; - // Regulatory/motif features - public static final String REGULATORY_FEATURES_FILE = "Regulatory_Build.regulatory_features.gff.gz"; - public static final String MOTIF_FEATURES_FILE = "motif_features.gff.gz"; + // Regulatory build and motif features (see Ensembl files: regulatory build and motif features files) + public static final String REGULATORY_BUILD_NAME = "Regulatory Build"; + public static final String REGULATORY_BUILD_VERSION_FILENAME = "regulatoryBuild" + SUFFIX_VERSION_FILENAME; + // Motif features (see Ensembl files) + public static final String MOTIF_FEATURES_NAME = "Motif Features"; + public static final String MOTIF_FEATURES_VERSION_FILENAME = "motifFeatures" + SUFFIX_VERSION_FILENAME; // miRBase public static final String MIRBASE_NAME = "miRBase"; public static final String MIRBASE_VERSION_FILENAME = "mirbase" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String 
MIRBASE_FILE_ID = "MIRBASE"; // miRTarBase public static final String MIRTARBASE_NAME = "miRTarBase"; - public static final String MIRTARBASE_VERSION_FILENAME = "mirtarbase" + SUFFIX_VERSION_FILENAME; + public static final String MIRTARBASE_VERSION_FILENAME = "mirTarBase" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String MIRTARBASE_FILE_ID = "MIRTARBASE"; // Build specific data options public static final String GENOME_INFO_DATA = "genome_info"; @@ -224,7 +244,6 @@ public class EtlCommons { // Must match the configuration file public static final String INTACT_FILE_ID = "INTACT"; - // Conservation scores public static final String CONSERVATION_DATA = "conservation"; public static final String CONSERVATION_SUBDIRECTORY = "conservation"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java index 1eabf8975a..cbe7c56952 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java @@ -20,7 +20,6 @@ import org.opencb.biodata.formats.obo.OboParser; import org.opencb.biodata.models.core.OntologyTerm; import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.opencb.cellbase.lib.EtlCommons; import org.opencb.commons.utils.FileUtils; import java.io.BufferedReader; @@ -36,10 +35,11 @@ public class OntologyBuilder extends CellBaseBuilder { public OntologyBuilder(Path oboDirectoryPath, CellBaseSerializer serializer) { super(serializer); - hpoFile = oboDirectoryPath.resolve(EtlCommons.HPO_FILE); - goFile = oboDirectoryPath.resolve(EtlCommons.GO_FILE); - doidFile = oboDirectoryPath.resolve(EtlCommons.DOID_FILE); - mondoFile = oboDirectoryPath.resolve(EtlCommons.MONDO_FILE); + // TODO: fix it !! 
+// hpoFile = oboDirectoryPath.resolve(EtlCommons.HPO_FILE); +// goFile = oboDirectoryPath.resolve(EtlCommons.GO_FILE); +// doidFile = oboDirectoryPath.resolve(EtlCommons.DOID_FILE); +// mondoFile = oboDirectoryPath.resolve(EtlCommons.MONDO_FILE); } @Override diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java index 03fc3a1cd6..d1ae5fb205 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java @@ -21,12 +21,12 @@ import org.opencb.biodata.formats.io.FileFormatException; import org.opencb.biodata.models.core.RegulatoryFeature; import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.opencb.cellbase.lib.EtlCommons; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.*; +import java.util.HashSet; +import java.util.Set; public class RegulatoryFeatureBuilder extends CellBaseBuilder { @@ -35,7 +35,9 @@ public class RegulatoryFeatureBuilder extends CellBaseBuilder { public RegulatoryFeatureBuilder(Path regulatoryDirectoryPath, CellBaseSerializer serializer) { super(serializer); - gffFile = regulatoryDirectoryPath.resolve(EtlCommons.REGULATORY_FEATURES_FILE); + // TODO: fix it ! + gffFile = null; +// gffFile = regulatoryDirectoryPath.resolve(EtlCommons.REGULATORY_FEATURES_FILE); } @Override @@ -44,7 +46,8 @@ public void parse() throws Exception { if (Files.exists(gffFile)) { parseGffFile(gffFile); } else { - logger.warn("No regulatory features GFF file found {}", EtlCommons.REGULATORY_FEATURES_FILE); + // TODO: fix it +// logger.warn("No regulatory features GFF file found {}", EtlCommons.REGULATORY_FEATURES_FILE); logger.warn("Skipping regulatory features GFF file parsing. 
Regulatory feature data models will not be built."); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryRegionBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryRegionBuilder.java deleted file mode 100644 index 3727ac4a69..0000000000 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryRegionBuilder.java +++ /dev/null @@ -1,607 +0,0 @@ -/* - * Copyright 2015-2020 OpenCB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.opencb.cellbase.lib.builders; - -import org.opencb.biodata.models.core.RegulatoryFeature; -import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.opencb.cellbase.lib.EtlCommons; -import org.opencb.commons.utils.FileUtils; - -import java.io.BufferedReader; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.sql.*; -import java.util.*; - -/** - * User: fsalavert. 
- * Date: 4/10/13 - * Time: 10:14 AM - */ -@Deprecated -public class RegulatoryRegionBuilder extends CellBaseBuilder { - - private static final int CHUNK_SIZE = 2000; - private static final String REGULATORY_FEATURES = "regulatory_features"; - @Deprecated - private static final String DEPRECATED_MOTIF_FEATURES = "deprecated_motif_features"; - private static final String MOTIF_FEATURES = "motif_features"; - private static final String FEATURE_TYPE = "feature_type"; - private static final String ID = "id"; - private static final String BINDING_MATRIX = "binding_matrix"; - private static final String MOTIF_FEATURE_TYPE = "motif_feature_type"; - private Path regulatoryRegionPath; - - public RegulatoryRegionBuilder(Path regulatoryRegionFilesDir, CellBaseSerializer serializer) { - super(serializer); - - this.regulatoryRegionPath = regulatoryRegionFilesDir; - - } - - public void createSQLiteRegulatoryFiles(Path regulatoryRegionPath) - throws SQLException, IOException, ClassNotFoundException, NoSuchMethodException { - List gffColumnNames = Arrays.asList("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "group"); - List gffColumnTypes = Arrays.asList("TEXT", "TEXT", "TEXT", "INT", "INT", "TEXT", "TEXT", "TEXT", "TEXT"); - - // Path regulatoryRegionPath = regulationDir.toPath(); - - Path filePath; - - filePath = regulatoryRegionPath.resolve(EtlCommons.REGULATORY_FEATURES_FILE); - createSQLiteRegulatoryFiles(filePath, REGULATORY_FEATURES, gffColumnNames, gffColumnTypes); - - filePath = regulatoryRegionPath.resolve(EtlCommons.MOTIF_FEATURES_FILE); - createSQLiteRegulatoryFiles(filePath, MOTIF_FEATURES, gffColumnNames, gffColumnTypes); - - // TODO: REMOVE - // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DEPRECATED - filePath = regulatoryRegionPath.resolve("AnnotatedFeatures.gff.gz"); - createSQLiteRegulatoryFiles(filePath, "annotated_features", gffColumnNames, gffColumnTypes); - - - filePath = regulatoryRegionPath.resolve("MotifFeatures.gff.gz"); 
- createSQLiteRegulatoryFiles(filePath, DEPRECATED_MOTIF_FEATURES, gffColumnNames, gffColumnTypes); - - - filePath = regulatoryRegionPath.resolve("RegulatoryFeatures_MultiCell.gff.gz"); - createSQLiteRegulatoryFiles(filePath, "regulatory_features_multicell", gffColumnNames, gffColumnTypes); - // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< DEPRECATED - - - -// GFFColumnNames = Arrays.asList("seqname", "source", "feature", "start", "end", "score", "strand", "frame"); -// GFFColumnTypes = Arrays.asList("TEXT", "TEXT", "TEXT", "INT", "INT", "TEXT", "TEXT", "TEXT"); - filePath = regulatoryRegionPath.resolve("mirna_uniq.gff.gz"); - if (Files.exists(filePath)) { - createSQLiteRegulatoryFiles(filePath, "mirna_uniq", gffColumnNames, gffColumnTypes); - } - - } - - @Override - public void parse() throws SQLException, IOException, ClassNotFoundException, NoSuchMethodException { - if (regulatoryRegionPath == null || !Files.exists(regulatoryRegionPath) || !Files.isDirectory(regulatoryRegionPath)) { - throw new IOException("Regulation directory whether does not exist, is not a directory or cannot be read"); - } - - // Create the SQLite databases - createSQLiteRegulatoryFiles(regulatoryRegionPath); - - String chunkIdSuffix = CHUNK_SIZE / 1000 + "k"; - - Path regulatoryFilePath = regulatoryRegionPath.resolve(EtlCommons.REGULATORY_FEATURES_FILE + ".db"); - Path motifFilePath = regulatoryRegionPath.resolve(EtlCommons.MOTIF_FEATURES_FILE + ".db"); - Path annotatedFilePath = regulatoryRegionPath.resolve("AnnotatedFeatures.gff.gz.db"); - Path deprecatedMotifFilePath = regulatoryRegionPath.resolve("MotifFeatures.gff.gz.db"); - Path deprecatedRegulatoryFilePath = regulatoryRegionPath.resolve("RegulatoryFeatures_MultiCell.gff.gz.db"); - Path mirnaFilePath = regulatoryRegionPath.resolve("mirna_uniq.gff.gz.db"); - - List filePaths = Arrays.asList(regulatoryFilePath, motifFilePath, annotatedFilePath, - deprecatedMotifFilePath, deprecatedRegulatoryFilePath); - List tableNames = 
Arrays.asList(REGULATORY_FEATURES, MOTIF_FEATURES, "annotated_features", - DEPRECATED_MOTIF_FEATURES, "regulatory_features_multicell"); - - if (Files.exists(mirnaFilePath)) { - filePaths.add(mirnaFilePath); - tableNames.add("mirna_uniq"); - } - - // Fetching and joining all chromosomes found in the different databases - Set setChr = new HashSet<>(); - setChr.addAll(getChromosomesList(regulatoryFilePath, REGULATORY_FEATURES)); - setChr.addAll(getChromosomesList(motifFilePath, MOTIF_FEATURES)); - setChr.addAll(getChromosomesList(annotatedFilePath, "annotated_features")); - setChr.addAll(getChromosomesList(deprecatedMotifFilePath, DEPRECATED_MOTIF_FEATURES)); - setChr.addAll(getChromosomesList(deprecatedRegulatoryFilePath, "regulatory_features_multicell")); - if (Files.exists(mirnaFilePath)) { - setChr.addAll(getChromosomesList(mirnaFilePath, "mirna_uniq")); - } - - List chromosomes = new ArrayList<>(setChr); - List regulatoryFeatures; - HashSet chunksHash; - for (String chromosome : chromosomes) { - for (int i = 0; i < tableNames.size(); i++) { - chunksHash = new HashSet<>(); - regulatoryFeatures = queryChromosomesRegulatoryDB(filePaths.get(i), tableNames.get(i), chromosome); - for (RegulatoryFeature regulatoryFeature : regulatoryFeatures) { - int firstChunkId = getChunkId(regulatoryFeature.getStart(), CHUNK_SIZE); - int lastChunkId = getChunkId(regulatoryFeature.getEnd(), CHUNK_SIZE); - - List chunkIds = new ArrayList<>(); - String chunkId; - for (int j = firstChunkId; j <= lastChunkId; j++) { - chunkId = chromosome + "_" + j + "_" + chunkIdSuffix; - chunkIds.add(chunkId); - //count chunks - if (!chunksHash.contains(j)) { - chunksHash.add(j); - } - } -// regulatoryFeature.setChunkIds(chunkIds); - - // remove 'chr' prefix -// if (genericFeature.getChromosome() != null) { -// genericFeature.setSequenceName(genericFeature.getSequenceName().replace("chr", "")); -// } - serializer.serialize(regulatoryFeature); - } - } - } - } - - - public void 
createSQLiteRegulatoryFiles(Path filePath, String tableName, List columnNames, List columnTypes) - throws ClassNotFoundException, IOException, SQLException { - int limitRows = 100000; - int batchCount = 0; - - if (!Files.exists(filePath) || Files.size(filePath) == 0) { - return; - } - - Path dbPath = Paths.get(filePath.toString() + ".db"); - if (Files.exists(dbPath) && Files.size(dbPath) > 0) { - return; - } - - BufferedReader br = FileUtils.newBufferedReader(filePath); - - Class.forName("org.sqlite.JDBC"); - Connection conn = DriverManager.getConnection("jdbc:sqlite:" + dbPath.toString()); - conn.setAutoCommit(false); //Set false to perform commits manually and increase performance on insertion - - //Create table query - Statement createTables = conn.createStatement(); - - StringBuilder sbQuery = new StringBuilder(); - sbQuery.append("CREATE TABLE if not exists " + tableName + "("); - for (int i = 0; i < columnNames.size(); i++) { //columnNames and columnTypes must have the same size - sbQuery.append("'" + columnNames.get(i) + "' " + columnTypes.get(i) + ","); - } - sbQuery.deleteCharAt(sbQuery.length() - 1); - sbQuery.append(")"); - - System.out.println(sbQuery.toString()); - createTables.executeUpdate(sbQuery.toString()); - - //Prepare insert query - sbQuery = new StringBuilder(); - sbQuery.append("INSERT INTO " + tableName + "("); - for (int i = 0; i < columnNames.size(); i++) { - sbQuery.append("'" + columnNames.get(i) + "',"); - } - sbQuery.deleteCharAt(sbQuery.length() - 1); - sbQuery.append(") values ("); - sbQuery.append(repeat("?,", columnNames.size())); - sbQuery.deleteCharAt(sbQuery.length() - 1); - sbQuery.append(")"); - System.out.println(sbQuery.toString()); - - PreparedStatement ps = conn.prepareStatement(sbQuery.toString()); - - //Read file - String line = null; - while ((line = br.readLine()) != null) { - - insertByType(ps, getFields(line, tableName), columnTypes); - ps.addBatch(); - batchCount++; - - //commit batch - if (batchCount % limitRows == 
0 && batchCount != 0) { - ps.executeBatch(); - conn.commit(); - } - - } - br.close(); - - //Execute last Batch - ps.executeBatch(); - conn.commit(); - - //Create index - System.out.println("creating indices..."); - createTables.executeUpdate("CREATE INDEX " + tableName + "_seqname_idx on " + tableName + "(" + columnNames.get(0) + ")"); - System.out.println("indices created."); - - conn.commit(); - conn.close(); - } - - public List getChromosomesList(Path dbPath, String tableName) throws IOException { - - try { - FileUtils.checkFile(dbPath); - } catch (IOException e) { - logger.warn(e.getMessage()); - return Collections.emptyList(); - } - - List chromosomes = new ArrayList<>(); - try { - Class.forName("org.sqlite.JDBC"); - Connection conn = DriverManager.getConnection("jdbc:sqlite:" + dbPath.toString()); - - Statement query = conn.createStatement(); - ResultSet rs = query.executeQuery("select distinct(seqname) from " + tableName); -// ResultSet rs = query.executeQuery("select distinct(seqname) from " + tableName + " where seqname like 'chr%'"); - - while (rs.next()) { - chromosomes.add(rs.getString(1)); - } - conn.close(); - - } catch (ClassNotFoundException | SQLException e) { - e.printStackTrace(); - } - return chromosomes; - } - - public List queryChromosomesRegulatoryDB(Path dbPath, String tableName, String chromosome) { - - try { - FileUtils.checkFile(dbPath); - } catch (IOException e) { - logger.warn(e.getMessage()); - return Collections.emptyList(); - } - - Connection conn; - List regulatoryFeatures = new ArrayList<>(); - try { - Class.forName("org.sqlite.JDBC"); - conn = DriverManager.getConnection("jdbc:sqlite:" + dbPath.toString()); - - Statement query = conn.createStatement(); - ResultSet rs = query.executeQuery("select * from " + tableName + " where seqname='" + chromosome + "'"); -// ResultSet rs = query.executeQuery("select * from " + tableName + " where seqname='chr" + chromosome + "'"); - while (rs.next()) { - 
regulatoryFeatures.add(getDeprecatedRegulatoryFeature(rs, tableName)); - } - conn.close(); - - } catch (ClassNotFoundException | SQLException e) { - e.printStackTrace(); - } - return regulatoryFeatures; - } - - public static List queryRegulatoryDB(Path dbPath, String tableName, String chrFile, int start, int end) { - Connection conn = null; - List regulatoryFeatures = new ArrayList<>(); - try { - Class.forName("org.sqlite.JDBC"); - conn = DriverManager.getConnection("jdbc:sqlite:" + dbPath.toString()); - - Statement query = conn.createStatement(); - ResultSet rs = query.executeQuery("select * from " + tableName + " where start<=" + end + " AND end>=" + start); - - while (rs.next()) { - regulatoryFeatures.add(getDeprecatedRegulatoryFeature(rs, tableName)); - } - conn.close(); - - } catch (ClassNotFoundException | SQLException e) { - e.printStackTrace(); - } - return regulatoryFeatures; - } - - private static RegulatoryFeature getDeprecatedRegulatoryFeature(ResultSet rs, String tableName) throws SQLException { - RegulatoryFeature regulatoryFeature = null; - switch (tableName.toLowerCase()) { - case REGULATORY_FEATURES: - regulatoryFeature = getRegulatoryFeature(rs); - break; - case MOTIF_FEATURES: - regulatoryFeature = getMotifFeature(rs); - break; - case "annotated_features": - regulatoryFeature = getAnnotatedFeature(rs); - break; - case "regulatory_features_multicell": - regulatoryFeature = getDeprecatedRegulatoryFeature(rs); - break; - case DEPRECATED_MOTIF_FEATURES: - regulatoryFeature = getDeprecatedMotifFeature(rs); - break; - case "mirna_uniq": - regulatoryFeature = getMirnaFeature(rs); - break; - default: - break; - } - return regulatoryFeature; - } - - private static RegulatoryFeature getMotifFeature(ResultSet rs) throws SQLException { - // GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3 - RegulatoryFeature regulatoryFeature = new RegulatoryFeature(); - Map groupFields = getGroupFields(rs.getString(9)); - - 
regulatoryFeature.setChromosome(rs.getString(1)); - regulatoryFeature.setSource(rs.getString(2)); - regulatoryFeature.setFeatureType(rs.getString(3)); - regulatoryFeature.setStart(rs.getInt(4)); - regulatoryFeature.setEnd(rs.getInt(5)); - regulatoryFeature.setScore(rs.getString(6)); - regulatoryFeature.setStrand(rs.getString(7)); - - // Seems weird that the motif_feature_type property is used to fill the Name field. However, this is how the - // it was being done from the previous ENSEMBL files - regulatoryFeature.setName(groupFields.get(MOTIF_FEATURE_TYPE)); - - regulatoryFeature.setMatrix(groupFields.get(BINDING_MATRIX)); - - return regulatoryFeature; - } - - private static RegulatoryFeature getRegulatoryFeature(ResultSet rs) throws SQLException { - // GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3 - RegulatoryFeature regulatoryFeature = new RegulatoryFeature(); - Map groupFields = getGroupFields(rs.getString(9)); - - regulatoryFeature.setId(groupFields.get(ID)); - regulatoryFeature.setChromosome(rs.getString(1)); - regulatoryFeature.setSource(rs.getString(2)); - regulatoryFeature.setFeatureType(groupFields.get(FEATURE_TYPE).replace(" ", "_")); - regulatoryFeature.setStart(rs.getInt(4)); - regulatoryFeature.setEnd(rs.getInt(5)); - regulatoryFeature.setScore(rs.getString(6)); - regulatoryFeature.setStrand(rs.getString(7)); - - return regulatoryFeature; - } - - private static RegulatoryFeature getAnnotatedFeature(ResultSet rs) throws SQLException { - // GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3 - RegulatoryFeature regulatoryFeature = new RegulatoryFeature(); - Map groupFields = getGroupFields(rs.getString(9)); - - regulatoryFeature.setChromosome(rs.getString(1)); - regulatoryFeature.setSource(rs.getString(2)); - regulatoryFeature.setFeatureType(rs.getString(3)); - regulatoryFeature.setStart(rs.getInt(4)); - regulatoryFeature.setEnd(rs.getInt(5)); - regulatoryFeature.setScore(rs.getString(6)); - regulatoryFeature.setStrand(rs.getString(7)); - 
regulatoryFeature.setFrame(rs.getString(8)); - - regulatoryFeature.setName(groupFields.get("name")); - regulatoryFeature.setAlias(groupFields.get("alias")); - regulatoryFeature.setFeatureClass(groupFields.get("class")); - regulatoryFeature.getCellTypes().add(groupFields.get("cell_type")); - - return regulatoryFeature; - } - - @Deprecated - private static RegulatoryFeature getDeprecatedRegulatoryFeature(ResultSet rs) throws SQLException { - // GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3 - RegulatoryFeature regulatoryFeature = new RegulatoryFeature(); - Map groupFields = getGroupFields(rs.getString(9)); - - regulatoryFeature.setChromosome(rs.getString(1)); - regulatoryFeature.setSource(rs.getString(2)); - regulatoryFeature.setFeatureType(rs.getString(3)); - regulatoryFeature.setStart(rs.getInt(4)); - regulatoryFeature.setEnd(rs.getInt(5)); - regulatoryFeature.setScore(rs.getString(6)); - regulatoryFeature.setStrand(rs.getString(7)); - regulatoryFeature.setFrame(rs.getString(8)); - regulatoryFeature.setFrame(rs.getString(9)); - - return regulatoryFeature; - } - - @Deprecated - private static RegulatoryFeature getDeprecatedMotifFeature(ResultSet rs) throws SQLException { - // GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3 - RegulatoryFeature regulatoryFeature = new RegulatoryFeature(); - Map groupFields = getGroupFields(rs.getString(9)); - - regulatoryFeature.setChromosome(rs.getString(1)); - regulatoryFeature.setSource(rs.getString(2)); - regulatoryFeature.setFeatureType(rs.getString(3) + "_motif"); - regulatoryFeature.setStart(rs.getInt(4)); - regulatoryFeature.setEnd(rs.getInt(5)); - regulatoryFeature.setScore(rs.getString(6)); - regulatoryFeature.setStrand(rs.getString(7)); - regulatoryFeature.setFrame(rs.getString(8)); - - String[] split = groupFields.get("name").split(":"); - regulatoryFeature.setName(split[0]); - regulatoryFeature.setMatrix(split[1]); - - return regulatoryFeature; - } - - private static RegulatoryFeature 
getMirnaFeature(ResultSet rs) throws SQLException { - // GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3 - RegulatoryFeature regulatoryFeature = new RegulatoryFeature(); - Map groupFields = getGroupFields(rs.getString(9)); - - regulatoryFeature.setChromosome(rs.getString(1)); - regulatoryFeature.setSource(rs.getString(2)); - regulatoryFeature.setFeatureType(rs.getString(3)); - regulatoryFeature.setStart(rs.getInt(4)); - regulatoryFeature.setEnd(rs.getInt(5)); - regulatoryFeature.setScore(rs.getString(6)); - regulatoryFeature.setStrand(rs.getString(7)); - regulatoryFeature.setFrame(rs.getString(8)); - - regulatoryFeature.setFeatureClass("microRNA"); - regulatoryFeature.setName(groupFields.get("name")); - - return regulatoryFeature; - } - - private static Map getGroupFields(String group) { - //process group column - Map groupFields = new HashMap<>(); - String[] attributeFields = group.split(";"); - String[] attributeKeyValue; - for (String attributeField : attributeFields) { - attributeKeyValue = attributeField.trim().split("="); - groupFields.put(attributeKeyValue[0].toLowerCase(), attributeKeyValue[1]); - } - return groupFields; - } - - - public static List getFields(String line, String tableName) { - List fields = new ArrayList<>(); - switch (tableName.toLowerCase()) { - case REGULATORY_FEATURES: - fields = getRegulatoryFeaturesFields(line); - break; - case MOTIF_FEATURES: - fields = getMotifFeaturesFields(line); - break; - case "annotated_features": - fields = getAnnotatedFeaturesFields(line); - break; - case "regulatory_features_multicell": - fields = getRegulatoryFeaturesFields(line); - break; - case DEPRECATED_MOTIF_FEATURES: - fields = getMotifFeaturesFields(line); - break; - case "mirna_uniq": - fields = getMirnaFeaturesFields(line); - break; - default: - break; - } - return fields; - } - - @Deprecated - public static List getAnnotatedFeaturesFields(String line) { - String[] fields = line.split("\t"); - fields[0] = fields[0].replace("chr", ""); - 
return Arrays.asList(fields); - } - - public static List getRegulatoryFeaturesFields(String line) { - String[] fields = line.split("\t"); - fields[0] = fields[0].replace("chr", ""); - return Arrays.asList(fields); - } - - public static List getMotifFeaturesFields(String line) { - String[] fields = line.split("\t"); - fields[0] = fields[0].replace("chr", ""); - return Arrays.asList(fields); - } - - public static List getMirnaFeaturesFields(String line) { - String[] fields = line.split("\t"); - fields[0] = fields[0].replace("chr", ""); - return Arrays.asList(fields); - } - - public static void insertByType(PreparedStatement ps, List fields, List types) throws SQLException { - //Datatypes In SQLite Version 3 -> http://www.sqlite.org/datatype3.html - String raw; - String type; - if (types.size() == fields.size()) { - for (int i = 0; i < fields.size(); i++) { //columnNames and columnTypes must have same size - int sqliteIndex = i + 1; - raw = fields.get(i); - type = types.get(i); - - switch (type) { - case "INTEGER": - case "INT": - ps.setInt(sqliteIndex, Integer.parseInt(raw)); - break; - case "REAL": - ps.setFloat(sqliteIndex, Float.parseFloat(raw)); - break; - case "TEXT": - ps.setString(sqliteIndex, raw); - break; - default: - ps.setString(sqliteIndex, raw); - break; - } - } - } - - } - - public String repeat(String s, int n) { - if (s == null) { - return null; - } - final StringBuilder sb = new StringBuilder(); - for (int i = 0; i < n; i++) { - sb.append(s); - } - return sb.toString(); - } - - private int getChunkId(int position, int chunksize) { - if (chunksize <= 0) { - return position / CHUNK_SIZE; - } else { - return position / chunksize; - } - } - - private int getChunkStart(int id, int chunksize) { - if (chunksize <= 0) { - return (id == 0) ? 1 : id * CHUNK_SIZE; - } else { - return (id == 0) ? 
1 : id * chunksize; - } - } - - private int getChunkEnd(int id, int chunksize) { - if (chunksize <= 0) { - return (id * CHUNK_SIZE) + CHUNK_SIZE - 1; - } else { - return (id * chunksize) + chunksize - 1; - } - } -} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index bbd25cf8f7..df4aa069bf 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -34,14 +34,6 @@ public class GenomeDownloadManager extends AbstractDownloadManager { - private static final String ENSEMBL_NAME = "ENSEMBL"; - private static final String TRF_NAME = "Tandem repeats finder"; - private static final String GSD_NAME = "Genomic super duplications"; - private static final String WM_NAME = "WindowMasker"; - - private static final String PUT_ASSEMBLY_HERE_MARK = "put_assembly_here"; - private static final String PUT_CHROMOSOME_HERE_MARK = "put_chromosome_here"; - public GenomeDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) throws IOException, CellBaseException { super(species, assembly, targetDirectory, configuration); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java index 546bb2dc7e..1ca0693b80 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java @@ -16,27 +16,15 @@ package org.opencb.cellbase.lib.download; -import com.fasterxml.jackson.databind.ObjectMapper; -import org.apache.commons.lang3.StringUtils; -import org.opencb.biodata.formats.feature.gff.Gff2; -import 
org.opencb.biodata.formats.feature.gff.io.Gff2Reader; -import org.opencb.biodata.formats.io.FileFormatException; -import org.opencb.biodata.models.core.RegulatoryPfm; import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; -import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.opencb.cellbase.lib.EtlCommons; import java.io.IOException; -import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.*; -import java.util.concurrent.TimeUnit; -import java.util.regex.Matcher; -import java.util.regex.Pattern; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -51,7 +39,7 @@ public RegulationDownloadManager(String species, String assembly, Path outdir, C } @Override - public List download() throws IOException, InterruptedException, NoSuchMethodException, FileFormatException { + public List download() throws IOException, InterruptedException { if (!speciesHasInfoToDownload(speciesConfiguration, REGULATION_DATA)) { return Collections.emptyList(); } @@ -69,100 +57,108 @@ public List download() throws IOException, InterruptedException, N } /** - * Downloads Ensembl regulatory buid and motif feature files. + * Downloads Ensembl regulatory build and motif feature files. 
* @throws IOException Any issue when writing files * @throws InterruptedException Any issue downloading files */ - private List downloadRegulatoryaAndMotifFeatures() - throws IOException, InterruptedException, NoSuchMethodException, FileFormatException { - String regulationUrl = ensemblHostUrl + "/" + ensemblRelease; + private List downloadRegulatoryaAndMotifFeatures() throws IOException, InterruptedException { + String baseUrl = ensemblHostUrl + "/" + ensemblRelease; if (!configuration.getSpecies().getVertebrates().contains(speciesConfiguration)) { - regulationUrl = ensemblHostUrl + "/" + ensemblRelease + "/" + getPhylo(speciesConfiguration); + baseUrl = ensemblHostUrl + "/" + ensemblRelease + "/" + getPhylo(speciesConfiguration); } - regulationUrl += "/regulation/" + speciesShortName; List downloadFiles = new ArrayList<>(); - Path outputFile = regulationFolder.resolve(EtlCommons.REGULATORY_FEATURES_FILE); - String regulatoryBuildUrl = regulationUrl + "/*Regulatory_Build.regulatory_features*.gff.gz"; - downloadFiles.add(downloadFile(regulatoryBuildUrl, outputFile.toString())); - - outputFile = regulationFolder.resolve(EtlCommons.MOTIF_FEATURES_FILE); - String motifUrl = regulationUrl + "/MotifFeatures/*" + assemblyConfiguration.getName() + ".motif_features.gff.gz"; - downloadFiles.add(downloadFile(motifUrl, outputFile.toString())); - - String motifTbiUrl = regulationUrl + "/MotifFeatures/*" + assemblyConfiguration.getName() + ".motif_features.gff.gz.tbi"; - outputFile = regulationFolder.resolve(EtlCommons.MOTIF_FEATURES_FILE + ".tbi"); - downloadFiles.add(downloadFile(motifTbiUrl, outputFile.toString())); - - loadPfmMatrices(); + // Regulatory build + String url = (baseUrl + configuration.getDownload().getEnsembl().getUrl().getFiles().get(REGULATORY_BUILD_FILE_ID)) + .replaceAll(PUT_SPECIES_HERE_MARK, speciesShortName); + String outputFileName = getFilenameFromUrl(url); + Path outputPath = regulationFolder.resolve(outputFileName); + 
logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); + downloadFiles.add(downloadFile(url, outputPath.toString())); + // Save data source (name, category, version,...) + saveDataSource(REGULATORY_BUILD_NAME, REGULATION_DATA, ensemblVersion, getTimeStamp(), Collections.singletonList(url), + regulationFolder.resolve(REGULATORY_BUILD_VERSION_FILENAME)); + + // Motif features + List urls = new ArrayList<>(); + url = (baseUrl + configuration.getDownload().getEnsembl().getUrl().getFiles().get(MOTIF_FEATURES_FILE_ID)) + .replaceAll(PUT_SPECIES_HERE_MARK, speciesShortName).replaceAll(PUT_ASSEMBLY_HERE_MARK, assemblyConfiguration.getName()); + outputFileName = getFilenameFromUrl(url); + outputPath = regulationFolder.resolve(outputFileName); + logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); + downloadFiles.add(downloadFile(url, outputPath.toString())); + urls.add(url); + // Motif features index + url = (baseUrl + configuration.getDownload().getEnsembl().getUrl().getFiles().get(MOTIF_FEATURES_INDEX_FILE_ID)) + .replaceAll(PUT_SPECIES_HERE_MARK, speciesShortName).replaceAll(PUT_ASSEMBLY_HERE_MARK, assemblyConfiguration.getName()); + outputFileName = getFilenameFromUrl(url); + outputPath = regulationFolder.resolve(outputFileName); + logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); + downloadFiles.add(downloadFile(url, outputPath.toString())); + // Save data source (name, category, version,...) 
+ saveDataSource(REGULATORY_BUILD_NAME, MOTIF_FEATURES_NAME, ensemblVersion, getTimeStamp(), urls, + regulationFolder.resolve(MOTIF_FEATURES_VERSION_FILENAME)); + + // This will be executed in the CellBase build +// loadPfmMatrices(); return downloadFiles; } - private void loadPfmMatrices() throws IOException, NoSuchMethodException, FileFormatException, InterruptedException { - logger.info("Downloading and building pfm matrices..."); - if (Files.exists(buildFolder.resolve("regulatory_pfm.json.gz"))) { - logger.info("regulatory_pfm.json.gz is already built"); - return; - } - Set motifIds = new HashSet<>(); - Path motifGffFile = regulationFolder.resolve(EtlCommons.MOTIF_FEATURES_FILE); - try (Gff2Reader motifsFeatureReader = new Gff2Reader(motifGffFile)) { - Gff2 tfbsMotifFeature; - Pattern filePattern = Pattern.compile("ENSPFM(\\d+)"); - while ((tfbsMotifFeature = motifsFeatureReader.read()) != null) { - String pfmId = getMatrixId(filePattern, tfbsMotifFeature); - if (StringUtils.isNotEmpty(pfmId)) { - motifIds.add(pfmId); - } - } - } - - ObjectMapper mapper = new ObjectMapper(); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "regulatory_pfm", true); - if (logger.isInfoEnabled()) { - logger.info("Looking up {} pfms", motifIds.size()); - } - for (String pfmId : motifIds) { - String urlString = "https://rest.ensembl.org/species/homo_sapiens/binding_matrix/" + pfmId - + "?unit=frequencies;content-type=application/json"; - URL url = new URL(urlString); - RegulatoryPfm regulatoryPfm = mapper.readValue(url, RegulatoryPfm.class); - serializer.serialize(regulatoryPfm); - // https://github.com/Ensembl/ensembl-rest/wiki/Rate-Limits - TimeUnit.MILLISECONDS.sleep(250); - } - serializer.close(); - } - - private String getMatrixId(Pattern pattern, Gff2 tfbsMotifFeature) { - Matcher matcher = pattern.matcher(tfbsMotifFeature.getAttribute()); - if (matcher.find()) { - return matcher.group(0); - } - return null; - } +// private void loadPfmMatrices() 
throws IOException, NoSuchMethodException, FileFormatException, InterruptedException { +// logger.info("Downloading and building pfm matrices..."); +// if (Files.exists(buildFolder.resolve("regulatory_pfm.json.gz"))) { +// logger.info("regulatory_pfm.json.gz is already built"); +// return; +// } +// Set motifIds = new HashSet<>(); +// Path motifGffFile = regulationFolder.resolve(EtlCommons.MOTIF_FEATURES_FILE); +// try (Gff2Reader motifsFeatureReader = new Gff2Reader(motifGffFile)) { +// Gff2 tfbsMotifFeature; +// Pattern filePattern = Pattern.compile("ENSPFM(\\d+)"); +// while ((tfbsMotifFeature = motifsFeatureReader.read()) != null) { +// String pfmId = getMatrixId(filePattern, tfbsMotifFeature); +// if (StringUtils.isNotEmpty(pfmId)) { +// motifIds.add(pfmId); +// } +// } +// } +// +// ObjectMapper mapper = new ObjectMapper(); +// CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "regulatory_pfm", true); +// if (logger.isInfoEnabled()) { +// logger.info("Looking up {} pfms", motifIds.size()); +// } +// for (String pfmId : motifIds) { +// String urlString = "https://rest.ensembl.org/species/homo_sapiens/binding_matrix/" + pfmId +// + "?unit=frequencies;content-type=application/json"; +// URL url = new URL(urlString); +// RegulatoryPfm regulatoryPfm = mapper.readValue(url, RegulatoryPfm.class); +// serializer.serialize(regulatoryPfm); +// // https://github.com/Ensembl/ensembl-rest/wiki/Rate-Limits +// TimeUnit.MILLISECONDS.sleep(250); +// } +// serializer.close(); +// } +// +// private String getMatrixId(Pattern pattern, Gff2 tfbsMotifFeature) { +// Matcher matcher = pattern.matcher(tfbsMotifFeature.getAttribute()); +// if (matcher.find()) { +// return matcher.group(0); +// } +// return null; +// } private DownloadFile downloadMirna() throws IOException, InterruptedException { logger.info("Downloading {} ...", MIRBASE_NAME); - String url = configuration.getDownload().getMirbase().getHost(); - - saveVersionData(EtlCommons.REGULATION_DATA, 
MIRBASE_NAME, configuration.getDownload().getMirbase().getVersion(), getTimeStamp(), - Collections.singletonList(url), regulationFolder.resolve(MIRBASE_VERSION_FILENAME)); - Path outputPath = regulationFolder.resolve(Paths.get(url).getFileName()); - logger.info("Downloading from {} to {} ...", url, outputPath); - return downloadFile(url, outputPath.toString()); + return downloadAndSaveDataSource(configuration.getDownload().getMirbase(), MIRBASE_NAME, REGULATION_DATA, MIRBASE_FILE_ID, + MIRBASE_VERSION_FILENAME, regulationFolder); } private DownloadFile downloadMiRTarBase() throws IOException, InterruptedException { logger.info("Downloading {} ...", MIRTARBASE_NAME); - String url = configuration.getDownload().getMiRTarBase().getHost(); - - saveVersionData(EtlCommons.REGULATION_DATA, MIRTARBASE_NAME, configuration.getDownload().getMiRTarBase().getVersion(), - getTimeStamp(), Collections.singletonList(url), regulationFolder.resolve(MIRTARBASE_VERSION_FILENAME)); - Path outputPath = regulationFolder.resolve(Paths.get(url).getFileName()); - logger.info("Downloading from {} to {} ...", url, outputPath); - return downloadFile(url, outputPath.toString()); + return downloadAndSaveDataSource(configuration.getDownload().getMiRTarBase(), MIRTARBASE_NAME, REGULATION_DATA, MIRTARBASE_FILE_ID, + MIRBASE_VERSION_FILENAME, regulationFolder); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/DataReleaseManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/DataReleaseManager.java index 3bc97b1824..507f554eab 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/DataReleaseManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/DataReleaseManager.java @@ -199,9 +199,6 @@ public void update(DataRelease dataRelase) { if (CollectionUtils.isNotEmpty(source.getUrls())) { map.put("urls", source.getUrls()); } - if (CollectionUtils.isNotEmpty(source.getNotes())) { - map.put("notes", source.getUrls()); - 
} tmp.add(map); } releaseDBAdaptor.update(dataRelase.getRelease(), "sources", tmp); From e18506b9580d443a518833cf893b6261a8af7a19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 12 Apr 2024 13:39:14 +0200 Subject: [PATCH 020/148] lib: update CellBase downloaders, #TASK-5775, #TASK-5564 --- .../src/main/resources/configuration.yml | 8 +- .../org/opencb/cellbase/lib/EtlCommons.java | 52 ++++++++++- .../lib/download/AbstractDownloadManager.java | 93 ++++++++++++++++--- .../lib/download/CaddDownloadManager.java | 2 +- .../lib/download/ClinicalDownloadManager.java | 4 +- .../MissenseScoresDownloadManager.java | 4 +- .../lib/download/OntologyDownloadManager.java | 2 +- .../lib/download/ProteinDownloadManager.java | 5 +- .../download/RegulationDownloadManager.java | 68 ++++++-------- 9 files changed, 174 insertions(+), 64 deletions(-) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 28263dfb6e..3b237d5c3f 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -53,11 +53,11 @@ download: password: '' libs: "${CELLBASE.ENSEMBL.LIBS}" url: - host: ftp://ftp.ensembl.org/pub/ + host: https://ftp.ensembl.org/pub/ files: - REGULATORY_BUILD: "regulation/put_species_here/*Regulatory_Build.regulatory_features*.gff.gz" - MOTIF_FEATURES: "regulation/put_species_here/MotifFeatures/*put_assembly_here.motif_features.gff.gz" - MOTIF_FEATURES_INDEX: "regulation/put_species_here/MotifFeatures/*put_assembly_here.motif_features.gff.gz.tbi" + REGULATORY_BUILD: "regulation/put_species_here/put_species_here.put_assembly_here.Regulatory_Build.regulatory_features.20221007.gff.gz" + MOTIF_FEATURES: "regulation/put_species_here/MotifFeatures/put_species_here.put_assembly_here.motif_features.gff.gz" + MOTIF_FEATURES_INDEX: 
"regulation/put_species_here/MotifFeatures/put_species_here.put_assembly_here.motif_features.gff.gz.tbi" ensemblGenomes: database: host: mysql-eg-publicsql.ebi.ac.uk:4157 diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 207841aabb..279bf27ce1 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -16,9 +16,11 @@ package org.opencb.cellbase.lib; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.Level; import org.apache.logging.log4j.core.config.Configurator; +import org.opencb.cellbase.core.config.DownloadProperties; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.commons.utils.FileUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -368,7 +370,55 @@ public static Long countFileLines(Path filePath) throws IOException { } return nLines; } + } + + public static String getEnsemblUrl(DownloadProperties.EnsemblProperties props, String ensemblRelease, String fileId, String species, + String assembly, String chromosome) throws CellBaseException { + if (!props.getUrl().getFiles().containsKey(fileId)) { + throw new CellBaseException("File ID " + fileId + " is missing in the DownloadProperties.EnsemblProperties within the CellBase" + + " configuration file"); + } + String filesValue = props.getUrl().getFiles().get(fileId); + String url = props.getUrl().getHost() + ensemblRelease + "/" + filesValue; + // Change species, assembly, chromosome if necessary + if (StringUtils.isNotEmpty(species)) { + url = url.replaceAll(PUT_SPECIES_HERE_MARK, species); + } + if (StringUtils.isNotEmpty(assembly)) { + url = url.replaceAll(PUT_ASSEMBLY_HERE_MARK, assembly); + } + if (StringUtils.isNotEmpty(chromosome)) { + url = url.replaceAll(PUT_CHROMOSOME_HERE_MARK, 
chromosome); + } + return url; + } + public static String getUrl(DownloadProperties.URLProperties props, String fileId) throws CellBaseException { + return getUrl(props, fileId, null, null, null); } + public static String getUrl(DownloadProperties.URLProperties props, String fileId, String species, String assembly, String chromosome) + throws CellBaseException { + if (!props.getFiles().containsKey(fileId)) { + throw new CellBaseException("File ID " + fileId + " is missing in the DownloadProperties.URLProperties within the CellBase" + + " configuration file"); + } + String url; + String filesValue = props.getFiles().get(fileId); + if (filesValue.startsWith("https://") || filesValue.startsWith("http://") || filesValue.startsWith("ftp://")) { + url = filesValue; + } else { + url = props.getHost() + filesValue; + } + if (StringUtils.isNotEmpty(species)) { + url = url.replaceAll(PUT_SPECIES_HERE_MARK, species); + } + if (StringUtils.isNotEmpty(assembly)) { + url = url.replaceAll(PUT_ASSEMBLY_HERE_MARK, assembly); + } + if (StringUtils.isNotEmpty(chromosome)) { + url = url.replaceAll(PUT_CHROMOSOME_HERE_MARK, chromosome); + } + return url; + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index c87e2a9512..74ecbe4d4a 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -126,7 +126,7 @@ private void init() throws CellBaseException, IOException { logger.info("Processing species {}", speciesConfiguration.getScientificName()); } - public abstract List download() throws IOException, InterruptedException; + public abstract List download() throws IOException, InterruptedException, CellBaseException; protected boolean speciesHasInfoToDownload(SpeciesConfiguration sp, String info) { 
boolean hasInfo = true; @@ -137,26 +137,67 @@ protected boolean speciesHasInfoToDownload(SpeciesConfiguration sp, String info) return hasInfo; } - protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLProperties props, String name, String category, String fileId, - String versionFilename, Path outPath) throws IOException, InterruptedException { - logger.info("Downloading {} ({}) file ...", name, category); - String url = props.getHost() + props.getFiles().get(fileId); - File outFile = outPath.resolve(getFilenameFromUrl(url)).toFile(); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outFile); - DownloadFile downloadFile = downloadFile(url, outPath.toString()); + protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLProperties props, String fileId, String name, String category, + String versionFilename, Path outPath) + throws IOException, InterruptedException, CellBaseException { + return downloadAndSaveDataSource(props, fileId, name, category, null, versionFilename, outPath); + } + + protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLProperties props, String fileId, String name, String category, + String chromosome, String versionFilename, Path outPath) + throws IOException, InterruptedException, CellBaseException { + // Download file + DownloadFile downloadFile = downloadDataSource(props, fileId, chromosome, outPath); // Save data source - saveDataSource(name, category, props.getVersion(), getTimeStamp(), Collections.singletonList(url), + saveDataSource(name, category, props.getVersion(), getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), outPath.resolve(versionFilename)); return downloadFile; } - protected String getTimeStamp() { - return new SimpleDateFormat("yyyyMMdd_HHmmss").format(Calendar.getInstance().getTime()); + protected DownloadFile downloadAndSaveEnsemblDataSource(DownloadProperties.EnsemblProperties ensemblProps, String fileId, String name, + String category, String chromosome, String 
versionFilename, Path outPath) + throws IOException, InterruptedException, CellBaseException { + // Download file + DownloadFile downloadFile = downloadEnsemblDataSource(ensemblProps, fileId, chromosome, outPath); + + // Save data source + saveDataSource(name, category, "(Ensembl " + ensemblVersion + ")", getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + outPath.resolve(versionFilename)); + + return downloadFile; } - protected void saveDataSource(String name, String category, String version, String date, List urls, Path outputFilePath) + protected DownloadFile downloadDataSource(DownloadProperties.URLProperties props, String fileId, Path outPath) + throws IOException, InterruptedException, CellBaseException { + return downloadDataSource(props, fileId, null, outPath); + } + + protected DownloadFile downloadDataSource(DownloadProperties.URLProperties props, String fileId, + String chromosome, Path outPath) + throws IOException, InterruptedException, CellBaseException { + String url = EtlCommons.getUrl(props, fileId, species, assembly, chromosome); + File outFile = outPath.resolve(getFilenameFromUrl(url)).toFile(); + logger.info(DOWNLOADING_LOG_MESSAGE, url, outFile); + return downloadFile(url, outFile.toString()); + } + + protected DownloadFile downloadEnsemblDataSource(DownloadProperties.EnsemblProperties ensemblProps, String fileId, Path outPath) + throws IOException, InterruptedException, CellBaseException { + return downloadEnsemblDataSource(ensemblProps, fileId, null, outPath); + } + + protected DownloadFile downloadEnsemblDataSource(DownloadProperties.EnsemblProperties ensemblProps, String fileId, String chromosome, + Path outPath) throws IOException, InterruptedException, CellBaseException { + String url = EtlCommons.getEnsemblUrl(ensemblProps, ensemblRelease, fileId, speciesShortName, assemblyConfiguration.getName(), + chromosome); + File outFile = outPath.resolve(getFilenameFromUrl(url)).toFile(); + logger.info(DOWNLOADING_LOG_MESSAGE, url, 
outFile); + return downloadFile(url, outFile.toString()); + } + + protected void saveDataSource(String name, String category, String version, String date, List urls, Path versionFilePath) throws IOException { DataSource dataSource = new DataSource(name, category, version, date, urls); @@ -165,7 +206,11 @@ protected void saveDataSource(String name, String category, String version, Stri dataSource.setVersion(date); } - dataSourceWriter.writeValue(outputFilePath.toFile(), dataSource); + dataSourceWriter.writeValue(versionFilePath.toFile(), dataSource); + } + + protected String getTimeStamp() { + return new SimpleDateFormat("yyyyMMdd_HHmmss").format(Calendar.getInstance().getTime()); } protected String getLine(Path readmePath, int lineNumber) { @@ -293,6 +338,28 @@ private String getEnsemblURL(SpeciesConfiguration sp) { } } + @Deprecated + protected String getUrl(DownloadProperties.URLProperties props, String fileId) throws CellBaseException { + if (!props.getFiles().containsKey(fileId)) { + throw new CellBaseException("File ID " + fileId + " is missing in the DownloadProperties.URLProperties within the CellBase" + + " configuration file"); + } + String filesValue = props.getFiles().get(fileId); + if (filesValue.startsWith("https://") || filesValue.startsWith("http://") || filesValue.startsWith("ftp://")) { + return filesValue; + } else { + return props.getHost() + filesValue; + } + } + + protected String getFilenameFromProps(DownloadProperties.URLProperties props, String fileId) throws CellBaseException { + if (!props.getFiles().containsKey(fileId)) { + throw new CellBaseException("File ID " + fileId + " is missing in the DownloadProperties.URLProperties within the CellBase" + + " configuration file"); + } + return getFilenameFromUrl(props.getFiles().get(fileId)); + } + protected String getFilenameFromUrl(String url) { return Paths.get(url).getFileName().toString(); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java index 6743ed8a06..af3ff65baf 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java @@ -35,7 +35,7 @@ public CaddDownloadManager(String species, String assembly, Path targetDirectory } @Override - public List download() throws IOException, InterruptedException { + public List download() throws IOException, InterruptedException, CellBaseException { if (!speciesHasInfoToDownload(speciesConfiguration, VARIATION_FUNCTIONAL_SCORE_DATA)) { return null; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java index a274df11a4..1e66f1b5f0 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java @@ -43,13 +43,13 @@ public ClinicalDownloadManager(String species, String assembly, Path outdir, Cel } @Override - public List download() throws IOException, InterruptedException { + public List download() throws IOException, InterruptedException, CellBaseException { List downloadFiles = new ArrayList<>(); downloadFiles.addAll(downloadClinical()); return downloadFiles; } - public List downloadClinical() throws IOException, InterruptedException { + public List downloadClinical() throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { Path clinicalFolder = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANTS_SUBDIRECTORY).toAbsolutePath(); Files.createDirectories(clinicalFolder); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java index 0dba31ed78..50cf9ee0c0 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java @@ -36,11 +36,11 @@ public MissenseScoresDownloadManager(String species, String assembly, Path targe } @Override - public List download() throws IOException, InterruptedException { + public List download() throws IOException, InterruptedException, CellBaseException { return Collections.singletonList(downloadRevel()); } - public DownloadFile downloadRevel() throws IOException, InterruptedException { + public DownloadFile downloadRevel() throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(EtlCommons.HOMO_SAPIENS_NAME)) { Path missensePredictionScorePath = downloadFolder.resolve(EtlCommons.MISSENSE_VARIATION_SCORE_DATA); Files.createDirectories(missensePredictionScorePath); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java index 7e730a8b0a..b09cf76f2f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java @@ -34,7 +34,7 @@ public OntologyDownloadManager(String species, String assembly, Path targetDirec super(species, assembly, targetDirectory, configuration); } - public List download() throws IOException, InterruptedException { + public List download() throws IOException, InterruptedException, CellBaseException { Path oboFolder = downloadFolder.resolve(ONTOLOGY_SUBDIRECTORY); Files.createDirectories(oboFolder); logger.info("Downloading {} files {} ...", ONTOLOGY_DATA, oboFolder); diff --git 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java index 799bc92aad..519ea828d1 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java @@ -42,9 +42,10 @@ public ProteinDownloadManager(String species, String assembly, Path targetDirect * * @return list of files downloaded * @throws IOException if there is an error writing to a file - * @throws InterruptedException if there is an error downloading files * + * @throws InterruptedException if there is an error downloading files + * @throws CellBaseException if there is an error in the CelllBase configuration file */ - public List download() throws IOException, InterruptedException { + public List download() throws IOException, InterruptedException, CellBaseException { if (!speciesHasInfoToDownload(speciesConfiguration, PROTEIN_DATA)) { return null; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java index 1ca0693b80..26ed4776da 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java @@ -39,7 +39,7 @@ public RegulationDownloadManager(String species, String assembly, Path outdir, C } @Override - public List download() throws IOException, InterruptedException { + public List download() throws IOException, InterruptedException, CellBaseException { if (!speciesHasInfoToDownload(speciesConfiguration, REGULATION_DATA)) { return Collections.emptyList(); } @@ -61,52 +61,44 @@ public List download() throws IOException, InterruptedException { * @throws IOException Any issue when 
writing files * @throws InterruptedException Any issue downloading files */ - private List downloadRegulatoryaAndMotifFeatures() throws IOException, InterruptedException { - String baseUrl = ensemblHostUrl + "/" + ensemblRelease; - if (!configuration.getSpecies().getVertebrates().contains(speciesConfiguration)) { - baseUrl = ensemblHostUrl + "/" + ensemblRelease + "/" + getPhylo(speciesConfiguration); - } + private List downloadRegulatoryaAndMotifFeatures() throws IOException, InterruptedException, CellBaseException { +// String baseUrl; +// if (configuration.getSpecies().getVertebrates().contains(speciesConfiguration)) { +// baseUrl = ensemblHostUrl + ensemblRelease + "/"; +// } else { +// baseUrl = ensemblHostUrl + ensemblRelease + "/" + getPhylo(speciesConfiguration) + "/"; +// } + DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); // Regulatory build - String url = (baseUrl + configuration.getDownload().getEnsembl().getUrl().getFiles().get(REGULATORY_BUILD_FILE_ID)) - .replaceAll(PUT_SPECIES_HERE_MARK, speciesShortName); - String outputFileName = getFilenameFromUrl(url); - Path outputPath = regulationFolder.resolve(outputFileName); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); - downloadFiles.add(downloadFile(url, outputPath.toString())); - // Save data source (name, category, version,...) 
- saveDataSource(REGULATORY_BUILD_NAME, REGULATION_DATA, ensemblVersion, getTimeStamp(), Collections.singletonList(url), - regulationFolder.resolve(REGULATORY_BUILD_VERSION_FILENAME)); + downloadFile = downloadAndSaveEnsemblDataSource(configuration.getDownload().getEnsembl(), REGULATORY_BUILD_FILE_ID, + REGULATORY_BUILD_NAME, REGULATION_DATA, null, REGULATORY_BUILD_VERSION_FILENAME, regulationFolder); + downloadFiles.add(downloadFile); - // Motif features + // Motifs features List urls = new ArrayList<>(); - url = (baseUrl + configuration.getDownload().getEnsembl().getUrl().getFiles().get(MOTIF_FEATURES_FILE_ID)) - .replaceAll(PUT_SPECIES_HERE_MARK, speciesShortName).replaceAll(PUT_ASSEMBLY_HERE_MARK, assemblyConfiguration.getName()); - outputFileName = getFilenameFromUrl(url); - outputPath = regulationFolder.resolve(outputFileName); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); - downloadFiles.add(downloadFile(url, outputPath.toString())); - urls.add(url); - // Motif features index - url = (baseUrl + configuration.getDownload().getEnsembl().getUrl().getFiles().get(MOTIF_FEATURES_INDEX_FILE_ID)) - .replaceAll(PUT_SPECIES_HERE_MARK, speciesShortName).replaceAll(PUT_ASSEMBLY_HERE_MARK, assemblyConfiguration.getName()); - outputFileName = getFilenameFromUrl(url); - outputPath = regulationFolder.resolve(outputFileName); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); - downloadFiles.add(downloadFile(url, outputPath.toString())); + downloadFile = downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), MOTIF_FEATURES_FILE_ID, null, regulationFolder); + downloadFiles.add(downloadFile); + urls.add(downloadFile.getUrl()); + // And now the index file + downloadFile = downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), MOTIF_FEATURES_INDEX_FILE_ID, null, + regulationFolder); + downloadFiles.add(downloadFile); + urls.add(downloadFile.getUrl()); // Save data source (name, category, version,...) 
- saveDataSource(REGULATORY_BUILD_NAME, MOTIF_FEATURES_NAME, ensemblVersion, getTimeStamp(), urls, + saveDataSource(MOTIF_FEATURES_NAME, REGULATION_DATA, "(Ensembl " + ensemblVersion + ")", getTimeStamp(), urls, regulationFolder.resolve(MOTIF_FEATURES_VERSION_FILENAME)); - // This will be executed in the CellBase build + // TODO: This will be executed in the CellBase build // loadPfmMatrices(); return downloadFiles; } -// private void loadPfmMatrices() throws IOException, NoSuchMethodException, FileFormatException, InterruptedException { +// private void loadPfmMatrices() +// throws IOException, NoSuchMethodException, FileFormatException, InterruptedException, CellBaseException { // logger.info("Downloading and building pfm matrices..."); // if (Files.exists(buildFolder.resolve("regulatory_pfm.json.gz"))) { // logger.info("regulatory_pfm.json.gz is already built"); @@ -150,15 +142,15 @@ private List downloadRegulatoryaAndMotifFeatures() throws IOExcept // return null; // } - private DownloadFile downloadMirna() throws IOException, InterruptedException { + private DownloadFile downloadMirna() throws IOException, InterruptedException, CellBaseException { logger.info("Downloading {} ...", MIRBASE_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getMirbase(), MIRBASE_NAME, REGULATION_DATA, MIRBASE_FILE_ID, + return downloadAndSaveDataSource(configuration.getDownload().getMirbase(), MIRBASE_FILE_ID, MIRBASE_NAME, REGULATION_DATA, MIRBASE_VERSION_FILENAME, regulationFolder); } - private DownloadFile downloadMiRTarBase() throws IOException, InterruptedException { + private DownloadFile downloadMiRTarBase() throws IOException, InterruptedException, CellBaseException { logger.info("Downloading {} ...", MIRTARBASE_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getMiRTarBase(), MIRTARBASE_NAME, REGULATION_DATA, MIRTARBASE_FILE_ID, - MIRBASE_VERSION_FILENAME, regulationFolder); + return 
downloadAndSaveDataSource(configuration.getDownload().getMiRTarBase(), MIRTARBASE_FILE_ID, MIRTARBASE_NAME, REGULATION_DATA, + MIRTARBASE_VERSION_FILENAME, regulationFolder); } } From 69a58bf698262fa67677f7f4c263d2d616118bf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 15 Apr 2024 08:48:59 +0200 Subject: [PATCH 021/148] core: update CellBase configuration file, #TASK-5775, #TASK-5564 --- cellbase-core/src/main/resources/configuration.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 3b237d5c3f..061725feb1 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -70,6 +70,16 @@ download: host: https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2023-11-01.txt version: "2023-11-01" refSeq: + host: https://ftp.ncbi.nih.gov/refseq/ + version: "October 16, 2023 (GRCh38.p14)" + files: + GENOMIC_GTF: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz + GENOMIC_FNA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz + PROTEIN_FAA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_protein.faa.gz + RNA_FNA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz + + + host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz version: "2023-10-11" refSeqFasta: From d4e0cd659e81a6072a1c87e9ce635867042247f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 18 Apr 2024 11:20:49 +0200 Subject: [PATCH 022/148] lib: update MANE Select downloader, #TASK-5775, #TASK-5564 --- .../src/main/resources/configuration.yml | 18 ++------- .../org/opencb/cellbase/lib/EtlCommons.java | 7 ++++ 
.../lib/download/GeneDownloadManager.java | 39 ++++++++----------- 3 files changed, 27 insertions(+), 37 deletions(-) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 061725feb1..7ebb8d5f6e 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -77,23 +77,11 @@ download: GENOMIC_FNA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz PROTEIN_FAA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_protein.faa.gz RNA_FNA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz - - - - host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz - version: "2023-10-11" - refSeqFasta: - host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz - version: "2023-10-11" - refSeqProteinFasta: - host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_protein.faa.gz - version: "2023-10-11" - refSeqCdna: - host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz - version: "2023-10-11" maneSelect: - host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.1/MANE.GRCh38.v1.1.summary.txt.gz + host: https://ftp.ncbi.nlm.nih.gov/refseq/ version: "1.1" + files: + MANE_SELECT: MANE/MANE_human/release_1.1/MANE.GRCh38.v1.1.summary.txt.gz lrg: host: http://ftp.ebi.ac.uk/pub/databases/lrgex/list_LRGs_transcripts_xrefs.txt version: "2021-03-30" diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 279bf27ce1..cd3fef74ff 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -61,7 +61,14 @@ public class EtlCommons { public static final String GENE_DATA = "gene"; public static final String ENSEMBL_CORE_VERSION_FILENAME = "ensemblCore" + SUFFIX_VERSION_FILENAME; + + // MANE Select + public static final String MANE_SELECT_NAME = "MANE Select"; public static final String MANE_SELECT_VERSION_FILENAME = "maneSelect" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String MANE_SELECT_FILE_ID = "MANE_SELECT"; + + public static final String LRG_VERSION_FILENAME = "lrg" + SUFFIX_VERSION_FILENAME; public static final String HGNC_VERSION_FILENAME = "hgnc" + SUFFIX_VERSION_FILENAME; public static final String CANCER_HOTSPOT_VERSION_FILENAME = "cancerHotspot" + SUFFIX_VERSION_FILENAME; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index 843bc360e3..9e27ae22f1 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -32,19 +32,19 @@ public class GeneDownloadManager extends AbstractDownloadManager { - private static final String ENSEMBL_NAME = "ENSEMBL"; - private static final String REFSEQ_NAME = "RefSeq"; - private static final String UNIPROT_NAME = "UniProt"; - private static final String GENE_EXPRESSION_ATLAS_NAME = "Gene Expression Atlas"; - private static final String HPO_NAME = "HPO"; - private static final String DISGENET_NAME = "DisGeNET"; - private static final String MANE_SELECT_NAME = "MANE Select"; - private static final String LRG_NAME = "LRG"; - private static final String HGNC_GENE_NAME = "HGNC Gene"; - private static final String CANCER_HOTSPOT_NAME = "Cancer HotSpot"; - private static final String GO_ANNOTATION_NAME = "EBI Gene Ontology Annotation"; - 
private static final String DGIDB_NAME = "DGIdb"; - private static final String GNOMAD_NAME = "gnomAD"; +// private static final String ENSEMBL_NAME = "ENSEMBL"; +// private static final String REFSEQ_NAME = "RefSeq"; +// private static final String UNIPROT_NAME = "UniProt"; +// private static final String GENE_EXPRESSION_ATLAS_NAME = "Gene Expression Atlas"; +// private static final String HPO_NAME = "HPO"; +// private static final String DISGENET_NAME = "DisGeNET"; +// private static final String MANE_SELECT_NAME = "MANE Select"; +// private static final String LRG_NAME = "LRG"; +// private static final String HGNC_GENE_NAME = "HGNC Gene"; +// private static final String CANCER_HOTSPOT_NAME = "Cancer HotSpot"; +// private static final String GO_ANNOTATION_NAME = "EBI Gene Ontology Annotation"; +// private static final String DGIDB_NAME = "DGIdb"; +// private static final String GNOMAD_NAME = "gnomAD"; private static final Map GENE_UNIPROT_XREF_FILES; @@ -186,16 +186,11 @@ private DownloadFile downloadRefSeqFile(String name, DownloadProperties.URLPrope return downloadFile(url, outputPath.toString()); } - private DownloadFile downloadMane(Path geneFolder) throws IOException, InterruptedException { + private DownloadFile downloadMane(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading MANE Select ..."); - String url = configuration.getDownload().getManeSelect().getHost(); - saveDataSource(EtlCommons.GENE_DATA, MANE_SELECT_NAME, configuration.getDownload().getManeSelect().getVersion(), - getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(MANE_SELECT_VERSION_FILENAME)); - - Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); - return downloadFile(url, outputPath.toString()); + logger.info("Downloading {} ...", MANE_SELECT_NAME); + return 
downloadAndSaveDataSource(configuration.getDownload().getManeSelect(), MANE_SELECT_FILE_ID, MANE_SELECT_NAME, GENE_DATA, + MANE_SELECT_VERSION_FILENAME, geneFolder); } return null; } From 6ee2f78f82322a0d9e9f3eec6945a2c7b8d8d74e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 18 Apr 2024 13:01:40 +0200 Subject: [PATCH 023/148] lib: update LRG, HGNC, Cancer HotSpot, DGIDB, Gene Uniprot Xref, Gene Expression Atlas, Gene Disease Annotation, gnomAD Constraints and GO Annotation downloaders, #TASK-5775, #TASK-5564 --- .../src/main/resources/configuration.yml | 64 ++++--- .../org/opencb/cellbase/lib/EtlCommons.java | 72 ++++++-- .../lib/download/GeneDownloadManager.java | 161 ++++++------------ 3 files changed, 150 insertions(+), 147 deletions(-) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 7ebb8d5f6e..70acaf8776 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -66,9 +66,6 @@ download: libs: "${CELLBASE.ENSEMBL.LIBS}" url: host: ftp://ftp.ensemblgenomes.org/pub - hgnc: - host: https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2023-11-01.txt - version: "2023-11-01" refSeq: host: https://ftp.ncbi.nih.gov/refseq/ version: "October 16, 2023 (GRCh38.p14)" @@ -83,16 +80,52 @@ download: files: MANE_SELECT: MANE/MANE_human/release_1.1/MANE.GRCh38.v1.1.summary.txt.gz lrg: - host: http://ftp.ebi.ac.uk/pub/databases/lrgex/list_LRGs_transcripts_xrefs.txt + host: http://ftp.ebi.ac.uk/ version: "2021-03-30" + files: + LRG: pub/databases/lrgex/list_LRGs_transcripts_xrefs.txt + hgnc: + host: https://ftp.ebi.ac.uk/ + version: "2023-11-01" + files: + HGNC: pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2023-11-01.txt + cancerHotspot: + host: https://www.cancerhotspots.org/ + version: "v2" + files: + CANCER_HOTSPOT: 
files/hotspots_v2.xls + dgidb: + host: https://old.dgidb.org/ + version: "2022-02-01" + files: + DGIDB: data/monthly_tsvs/2022-Feb/interactions.tsv geneUniprotXref: - host: http://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/ + host: http://ftp.uniprot.org/ version: "2024_01 (24-Jan-2024)" + files: + UNIPROT_XREF: pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz geneExpressionAtlas: - host: https://ftp.ebi.ac.uk/pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz + host: https://ftp.ebi.ac.uk/ version: "2.0.14" + files: + GENE_EXPRESSION_ATLAS: pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz + hpo: + ## NOTE: Download manually from here now + host: https://hpo.jax.org/app/data/annotations + disgenet: + host: https://www.disgenet.org/ + version: "7.0 (January 2020)" + files: + DISGENET: static/disgenet_ap1/files/downloads/all_gene_disease_associations.tsv.gz + gnomadConstraints: + host: https://storage.googleapis.com/ + version: "2.1.1" + files: + GNOMAD_CONSTRAINTS: gcp-public-data--gnomad/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz goAnnotation: - host: http://geneontology.org/gene-associations/goa_human.gaf.gz + host: http://geneontology.org/ + files: + GO_ANNOTATION: gene-associations/goa_human.gaf.gz ## Regulation mirbase: @@ -173,9 +206,6 @@ download: GWAS: pub/databases/gwas/releases/2024/02/12/gwas-catalog-associations_ontology-annotated.tsv DBSNP: All.vcf.gz - cancerHotspot: - host: https://www.cancerhotspots.org/files/hotspots_v2.xls - version: "v2" dgv: host: http://dgv.tcag.ca/v106/docs simpleRepeats: @@ -206,22 +236,8 @@ download: files: CADD: download/CADD/v1.7/GRCh38/whole_genome_SNVs.tsv.gz - hpo: - ## NOTE: Download manually from here now - host: https://hpo.jax.org/app/data/annotations - disgenet: - host: https://www.disgenet.org/ - version: "7.0 
(January 2020)" - files: - ALL_GENE_DISEASE_ASSOCIATIONS: static/disgenet_ap1/files/downloads/all_gene_disease_associations.tsv.gz - dgidb: - host: https://old.dgidb.org/data/monthly_tsvs/2022-Feb/interactions.tsv - version: "2022-02-01" reactome: host: http://www.reactome.org/download/current/biopax.zip - gnomadConstraints: - host: https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz - version: "2.1.1" ## OBO Ontologies hpoObo: diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index cd3fef74ff..b7100a18a6 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -62,30 +62,78 @@ public class EtlCommons { public static final String GENE_DATA = "gene"; public static final String ENSEMBL_CORE_VERSION_FILENAME = "ensemblCore" + SUFFIX_VERSION_FILENAME; + // RefSeq + public static final String REFSEQ_NAME = "RefSeq"; + public static final String REFSEQ_VERSION_FILENAME = "refSeq" + SUFFIX_VERSION_FILENAME; +// public static final String REFSEQ_ASTA_VERSION_FILENAME = REFSEQ_DATA + "Fasta" + SUFFIX_VERSION_FILENAME; +// public static final String REFSEQ_PROTEIN_FASTA_VERSION_FILENAME = REFSEQ_DATA + "ProteinFasta" + SUFFIX_VERSION_FILENAME; +// public static final String REFSEQ_CDNA_FASTA_VERSION_FILENAME = REFSEQ_DATA + "CdnaFasta" + SUFFIX_VERSION_FILENAME; + // MANE Select public static final String MANE_SELECT_NAME = "MANE Select"; public static final String MANE_SELECT_VERSION_FILENAME = "maneSelect" + SUFFIX_VERSION_FILENAME; // Must match the configuration file public static final String MANE_SELECT_FILE_ID = "MANE_SELECT"; - + // LRG + public static final String LRG_NAME = "LRG"; public static final String LRG_VERSION_FILENAME = "lrg" + SUFFIX_VERSION_FILENAME; + // Must match the configuration 
file + public static final String LRG_FILE_ID = "LRG"; + + // HGNC + public static final String HGNC_NAME = "HGNC Gene"; public static final String HGNC_VERSION_FILENAME = "hgnc" + SUFFIX_VERSION_FILENAME; - public static final String CANCER_HOTSPOT_VERSION_FILENAME = "cancerHotspot" + SUFFIX_VERSION_FILENAME; - public static final String GO_ANNOTATION_VERSION_FILENAME = "goAnnotation" + SUFFIX_VERSION_FILENAME; - public static final String GNOMAD_VERSION_FILENAME = "gnomad" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String HGNC_FILE_ID = "HGNC"; + + // Cancer HotSpot + public static final String CANCER_HOTSPOT_NAME = "Cancer HotSpot"; + public static final String CANCER_HOTSPOT_VERSION_FILENAME = "cancerHotSpot" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String CANCER_HOTSPOT_FILE_ID = "CANCER_HOTSPOT"; + + // DGID (drug) + public static final String DGIDB_NAME = "DGIdb"; public static final String DGIDB_VERSION_FILENAME = "dgidb" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String DGIDB_FILE_ID = "DGIDB"; + + // UniProt Xref + public static final String UNIPROT_XREF_NAME = "UniProt Xref"; public static final String UNIPROT_XREF_VERSION_FILENAME = "uniprotXref" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String UNIPROT_XREF_FILE_ID = "UNIPROT_XREF"; + + // Gene Expression Atlas + public static final String GENE_EXPRESSION_ATLAS_NAME = "Gene Expression Atlas"; public static final String GENE_EXPRESSION_ATLAS_VERSION_FILENAME = "geneExpressionAtlas" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String GENE_EXPRESSION_ATLAS_FILE_ID = "GENE_EXPRESSION_ATLAS"; + + // Gene Disease Annotation + public static final String GENE_DISEASE_ANNOTATION_NAME = "Gene Disease Annotation"; + // HPO + public static final String HPO_NAME = "HPO"; public static final 
String HPO_VERSION_FILENAME = "hpo" + SUFFIX_VERSION_FILENAME; - public static final String DISGINET_VERSION_FILENAME = "disgenet" + SUFFIX_VERSION_FILENAME; - - public static final String REFSEQ_DATA = "refseq"; - public static final String REFSEQ_VERSION_FILENAME = REFSEQ_DATA + SUFFIX_VERSION_FILENAME; - public static final String REFSEQ_ASTA_VERSION_FILENAME = REFSEQ_DATA + "Fasta" + SUFFIX_VERSION_FILENAME; - public static final String REFSEQ_PROTEIN_FASTA_VERSION_FILENAME = REFSEQ_DATA + "ProteinFasta" + SUFFIX_VERSION_FILENAME; - public static final String REFSEQ_CDNA_FASTA_VERSION_FILENAME = REFSEQ_DATA + "CdnaFasta" + SUFFIX_VERSION_FILENAME; - public static final String GENE_DISEASE_ASSOCIATION_DATA = "gene_disease_association"; + // DISGENET + public static final String DISGENET_NAME = "DisGeNet"; + public static final String DISGENET_VERSION_FILENAME = "disGeNet" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String DISGENET_FILE_ID = "DISGENET"; + + // gnomAD Constraints + public static final String GNOMAD_CONSTRAINTS_NAME = "gnomAD Constraints"; + public static final String GNOMAD_CONSTRAINTS_VERSION_FILENAME = "gnomadConstraints" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String GNOMAD_CONSTRAINTS_FILE_ID = "GNOMAD_CONSTRAINTS"; + + // GO Annotation + public static final String GO_ANNOTATION_NAME = "EBI Gene Ontology Annotation"; + public static final String GO_ANNOTATION_VERSION_FILENAME = "goAnnotation" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String GO_ANNOTATION_FILE_ID = "GO_ANNOTATION"; + public static final String VARIATION_DATA = "variation"; public static final String CLINICAL_VARIANTS_DATA = "clinical_variants"; public static final String SPLICE_SCORE_DATA = "splice_score"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index 9e27ae22f1..679b9aaa95 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -32,20 +32,6 @@ public class GeneDownloadManager extends AbstractDownloadManager { -// private static final String ENSEMBL_NAME = "ENSEMBL"; -// private static final String REFSEQ_NAME = "RefSeq"; -// private static final String UNIPROT_NAME = "UniProt"; -// private static final String GENE_EXPRESSION_ATLAS_NAME = "Gene Expression Atlas"; -// private static final String HPO_NAME = "HPO"; -// private static final String DISGENET_NAME = "DisGeNET"; -// private static final String MANE_SELECT_NAME = "MANE Select"; -// private static final String LRG_NAME = "LRG"; -// private static final String HGNC_GENE_NAME = "HGNC Gene"; -// private static final String CANCER_HOTSPOT_NAME = "Cancer HotSpot"; -// private static final String GO_ANNOTATION_NAME = "EBI Gene Ontology Annotation"; -// private static final String DGIDB_NAME = "DGIdb"; -// private static final String GNOMAD_NAME = "gnomAD"; - private static final Map GENE_UNIPROT_XREF_FILES; static { @@ -64,7 +50,7 @@ public GeneDownloadManager(String species, String assembly, Path targetDirectory } @Override - public List download() throws IOException, InterruptedException { + public List download() throws IOException, InterruptedException, CellBaseException { logger.info("Downloading gene information ..."); Path geneFolder = downloadFolder.resolve("gene"); Files.createDirectories(geneFolder); @@ -143,6 +129,7 @@ private List downloadRefSeq(Path refSeqFolder) throws IOException, String timeStamp = getTimeStamp(); // gtf + dow DownloadFile downloadFile = downloadRefSeqFile(REFSEQ_NAME, configuration.getDownload().getRefSeq(), timeStamp, REFSEQ_VERSION_FILENAME, refSeqFolder); downloadFiles.add(downloadFile); @@ -179,7 
+166,7 @@ private DownloadFile downloadRefSeqFile(String name, DownloadProperties.URLPrope String version = urlProperties.getVersion(); String filename = getFilenameFromUrl(url); Path outputPath = refSeqFolder.resolve(filename); - saveDataSource(EtlCommons.REFSEQ_DATA, name, version, timeStamp, Collections.singletonList(url), + saveDataSource(name, EtlCommons.REFSEQ_NAME, version, timeStamp, Collections.singletonList(url), refSeqFolder.resolve(versionFilename)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); @@ -195,133 +182,85 @@ private DownloadFile downloadMane(Path geneFolder) throws IOException, Interrupt return null; } - private DownloadFile downloadLrg(Path geneFolder) throws IOException, InterruptedException { + private DownloadFile downloadLrg(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading LRG data ..."); - String url = configuration.getDownload().getLrg().getHost(); - saveDataSource(EtlCommons.GENE_DATA, LRG_NAME, configuration.getDownload().getLrg().getVersion(), - getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(LRG_VERSION_FILENAME)); - - Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); - return downloadFile(url, outputPath.toString()); + logger.info("Downloading {} ...", LRG_NAME); + return downloadAndSaveDataSource(configuration.getDownload().getLrg(), LRG_FILE_ID, LRG_NAME, GENE_DATA, LRG_VERSION_FILENAME, + geneFolder); } return null; } - private DownloadFile downloadHgnc(Path geneFolder) throws IOException, InterruptedException { + private DownloadFile downloadHgnc(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading HGNC data ..."); - String url = configuration.getDownload().getHgnc().getHost(); 
- saveDataSource(GENE_DATA, HGNC_GENE_NAME, configuration.getDownload().getHgnc().getVersion(), - getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(HGNC_VERSION_FILENAME)); - - Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); - return downloadFile(url, outputPath.toString()); + logger.info("Downloading {} ...", HGNC_NAME); + return downloadAndSaveDataSource(configuration.getDownload().getHgnc(), HGNC_FILE_ID, HGNC_NAME, GENE_DATA, + HGNC_VERSION_FILENAME, geneFolder); } return null; } - private DownloadFile downloadCancerHotspot(Path geneFolder) throws IOException, InterruptedException { + private DownloadFile downloadCancerHotspot(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading Cancer Hotspot ..."); - String url = configuration.getDownload().getCancerHotspot().getHost(); - saveDataSource(EtlCommons.GENE_DATA, CANCER_HOTSPOT_NAME, configuration.getDownload().getHgnc().getVersion(), - getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(CANCER_HOTSPOT_VERSION_FILENAME)); - - Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); - return downloadFile(url, outputPath.toString()); + logger.info("Downloading {} ...", CANCER_HOTSPOT_NAME); + return downloadAndSaveDataSource(configuration.getDownload().getCancerHotspot(), CANCER_HOTSPOT_FILE_ID, CANCER_HOTSPOT_NAME, + GENE_DATA, CANCER_HOTSPOT_VERSION_FILENAME, geneFolder); } return null; } - private DownloadFile downloadGO(Path geneFolder) throws IOException, InterruptedException { + private DownloadFile downloadDrugData(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading GO annotation..."); - String 
url = configuration.getDownload().getGoAnnotation().getHost(); - saveDataSource(EtlCommons.GENE_DATA, GO_ANNOTATION_NAME, configuration.getDownload().getGoAnnotation().getVersion(), - getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(GO_ANNOTATION_VERSION_FILENAME)); - - Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); - return downloadFile(url, outputPath.toString()); + logger.info("Downloading {} ...", DGIDB_NAME); + return downloadAndSaveDataSource(configuration.getDownload().getDgidb(), DGIDB_FILE_ID, DGIDB_NAME, GENE_DATA, + DGIDB_VERSION_FILENAME, geneFolder); } return null; } - private DownloadFile downloadGnomadConstraints(Path geneFolder) throws IOException, InterruptedException { - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading gnomAD constraints data..."); - String url = configuration.getDownload().getGnomadConstraints().getHost(); - saveDataSource(EtlCommons.GENE_DATA, GNOMAD_NAME, configuration.getDownload().getGnomadConstraints().getVersion(), - getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(GNOMAD_VERSION_FILENAME)); - - Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); - return downloadFile(url, outputPath.toString()); + private DownloadFile downloadGeneUniprotXref(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + if (GENE_UNIPROT_XREF_FILES.containsKey(speciesConfiguration.getScientificName())) { + logger.info("Downloading {} ...", UNIPROT_XREF_NAME); + return downloadAndSaveDataSource(configuration.getDownload().getGeneUniprotXref(), UNIPROT_XREF_FILE_ID, UNIPROT_XREF_NAME, + GENE_DATA, UNIPROT_XREF_VERSION_FILENAME, geneFolder); } return null; } - private DownloadFile downloadDrugData(Path geneFolder) throws IOException, InterruptedException { - if 
(speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading drug-gene data..."); - String url = configuration.getDownload().getDgidb().getHost(); - saveDataSource(EtlCommons.GENE_DATA, DGIDB_NAME, configuration.getDownload().getDgidb().getVersion(), getTimeStamp(), - Collections.singletonList(url), geneFolder.resolve(DGIDB_VERSION_FILENAME)); - - Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); - return downloadFile(url, outputPath.toString()); - } - return null; + private DownloadFile downloadGeneExpressionAtlas(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + logger.info("Downloading {} ...", GENE_EXPRESSION_ATLAS_NAME); + return downloadAndSaveDataSource(configuration.getDownload().getGeneExpressionAtlas(), GENE_EXPRESSION_ATLAS_FILE_ID, + GENE_EXPRESSION_ATLAS_NAME, GENE_DATA, GENE_EXPRESSION_ATLAS_VERSION_FILENAME, geneFolder); } - private DownloadFile downloadGeneUniprotXref(Path geneFolder) throws IOException, InterruptedException { - if (GENE_UNIPROT_XREF_FILES.containsKey(speciesConfiguration.getScientificName())) { - logger.info("Downloading UniProt ID mapping ..."); + private DownloadFile downloadGeneDiseaseAnnotation(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + logger.info("Downloading {} ...", GENE_DISEASE_ANNOTATION_NAME); - String filename = GENE_UNIPROT_XREF_FILES.get(speciesConfiguration.getScientificName()); - String geneGtfUrl = configuration.getDownload().getGeneUniprotXref().getHost() + "/" + filename; + // IMPORTANT !!! 
+ logger.warn("{} must be downloaded manually from {} and then create the file {} with data ({}), name ({}) and the version", + HPO_NAME, configuration.getDownload().getHpo().getHost(), HPO_VERSION_FILENAME, GENE_DATA, HPO_NAME); + saveDataSource(HPO_NAME, GENE_DISEASE_ANNOTATION_NAME, configuration.getDownload().getHpo().getVersion(), getTimeStamp(), + Collections.singletonList(configuration.getDownload().getHpo().getHost()), geneFolder.resolve(HPO_VERSION_FILENAME)); - saveDataSource(EtlCommons.GENE_DATA, UNIPROT_NAME, - configuration.getDownload().getGeneUniprotXref().getVersion(), getTimeStamp(), - Collections.singletonList(geneGtfUrl), geneFolder.resolve(UNIPROT_XREF_VERSION_FILENAME)); + return downloadAndSaveDataSource(configuration.getDownload().getDisgenet(), DISGENET_FILE_ID, DISGENET_NAME, + GENE_DISEASE_ANNOTATION_NAME, DISGENET_VERSION_FILENAME, geneFolder); + } - Path outputPath = geneFolder.resolve(filename); - logger.info(DOWNLOADING_LOG_MESSAGE, geneGtfUrl, outputPath); - return downloadFile(geneGtfUrl, outputPath.toString()); + private DownloadFile downloadGnomadConstraints(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + logger.info("Downloading {} ...", GNOMAD_CONSTRAINTS_NAME); + return downloadAndSaveDataSource(configuration.getDownload().getGnomadConstraints(), GNOMAD_CONSTRAINTS_FILE_ID, + GNOMAD_CONSTRAINTS_NAME, GENE_DATA, GNOMAD_CONSTRAINTS_VERSION_FILENAME, geneFolder); } - return null; } - private DownloadFile downloadGeneExpressionAtlas(Path geneFolder) throws IOException, InterruptedException { - logger.info("Downloading gene expression atlas ..."); - String geneGtfUrl = configuration.getDownload().getGeneExpressionAtlas().getHost(); - saveDataSource(EtlCommons.GENE_DATA, GENE_EXPRESSION_ATLAS_NAME, configuration.getDownload().getGeneExpressionAtlas().getVersion(), - getTimeStamp(), Collections.singletonList(geneGtfUrl), 
geneFolder.resolve(GENE_EXPRESSION_ATLAS_VERSION_FILENAME)); - - Path outputPath = geneFolder.resolve(getFilenameFromUrl(geneGtfUrl)); - logger.info(DOWNLOADING_LOG_MESSAGE, geneGtfUrl, outputPath); - return downloadFile(geneGtfUrl, outputPath.toString()); - } - - private DownloadFile downloadGeneDiseaseAnnotation(Path geneFolder) throws IOException, InterruptedException { - logger.info("Downloading gene disease annotation ..."); - - // IMPORTANT !!! - logger.warn("HPO must be downloaded manually from {} and then create the file {} with data ({}), name ({}) and the version", - configuration.getDownload().getHpo().getHost(), HPO_VERSION_FILENAME, GENE_DATA, HPO_NAME); - - String url = configuration.getDownload().getDisgenet().getHost(); - saveDataSource(EtlCommons.GENE_DISEASE_ASSOCIATION_DATA, DISGENET_NAME, configuration.getDownload().getDisgenet().getVersion(), - getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(DISGINET_VERSION_FILENAME)); - - Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); - return downloadFile(url, outputPath.toString()); + private DownloadFile downloadGO(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + logger.info("Downloading {} ...", GO_ANNOTATION_NAME); + return downloadAndSaveDataSource(configuration.getDownload().getGoAnnotation(), GO_ANNOTATION_FILE_ID, GO_ANNOTATION_NAME, + GENE_DATA, GO_ANNOTATION_VERSION_FILENAME, geneFolder); + } + return null; } } From d794ceb9add20c22c9698beddd920c3ef8f8468c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 18 Apr 2024 14:28:14 +0200 Subject: [PATCH 024/148] lib: update RefSeq downloader, #TASK-5775, #TASK-5564 --- .../core/config/DownloadProperties.java | 80 ------------------- .../org/opencb/cellbase/lib/EtlCommons.java | 9 ++- 
.../lib/download/GeneDownloadManager.java | 61 ++++---------- .../lib/download/GenomeDownloadManager.java | 50 ++++++------ 4 files changed, 46 insertions(+), 154 deletions(-) diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java index 6d03f28148..bb44f91138 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java @@ -28,12 +28,8 @@ public class DownloadProperties { private URLProperties hgnc; private URLProperties cancerHotspot; private URLProperties refSeq; - private URLProperties refSeqFasta; - private URLProperties refSeqProteinFasta; - private URLProperties refSeqCdna; private URLProperties maneSelect; private URLProperties lrg; - private URLProperties geneUniprotXref; private URLProperties geneExpressionAtlas; private URLProperties mirbase; @@ -44,8 +40,6 @@ public class DownloadProperties { private URLProperties intact; private URLProperties interpro; private URLProperties interproRelNotes; - @Deprecated - private URLProperties conservation; private URLProperties phastCons; private URLProperties phylop; private URLProperties gerp; @@ -56,12 +50,6 @@ public class DownloadProperties { private URLProperties clinvarEfoTerms; private URLProperties cosmic; private URLProperties hgmd; - @Deprecated - private URLProperties iarctp53; - @Deprecated - private URLProperties docm; - @Deprecated - private URLProperties docmVersion; private URLProperties dgv; private URLProperties simpleRepeats; private URLProperties windowMasker; @@ -201,17 +189,6 @@ public DownloadProperties setInterproRelNotes(URLProperties interproRelNotes) { return this; } - @Deprecated - public URLProperties getConservation() { - return conservation; - } - - @Deprecated - public DownloadProperties setConservation(URLProperties conservation) { - 
this.conservation = conservation; - return this; - } - public URLProperties getPhastCons() { return phastCons; } @@ -301,36 +278,6 @@ public DownloadProperties setHgmd(URLProperties hgmd) { return this; } - @Deprecated - public URLProperties getIarctp53() { - return iarctp53; - } - - @Deprecated - public void setIarctp53(URLProperties iarctp53) { - this.iarctp53 = iarctp53; - } - - @Deprecated - public URLProperties getDocm() { - return docm; - } - - @Deprecated - public void setDocm(URLProperties docm) { - this.docm = docm; - } - - @Deprecated - public URLProperties getDocmVersion() { - return docmVersion; - } - - @Deprecated - public void setDocmVersion(URLProperties docmVersion) { - this.docmVersion = docmVersion; - } - public URLProperties getDgv() { return dgv; } @@ -489,19 +436,6 @@ public DownloadProperties setRefSeq(URLProperties refSeq) { return this; } - public URLProperties getRefSeqFasta() { - return refSeqFasta; - } - - public DownloadProperties setRefSeqFasta(URLProperties refSeqFasta) { - this.refSeqFasta = refSeqFasta; - return this; - } - - public URLProperties getRefSeqProteinFasta() { - return refSeqProteinFasta; - } - public URLProperties getRevel() { return revel; } @@ -529,20 +463,6 @@ public DownloadProperties setPharmGKB(URLProperties pharmGKB) { return this; } - public DownloadProperties setRefSeqProteinFasta(URLProperties refSeqProteinFasta) { - this.refSeqProteinFasta = refSeqProteinFasta; - return this; - } - - public URLProperties getRefSeqCdna() { - return refSeqCdna; - } - - public DownloadProperties setRefSeqCdna(URLProperties refSeqCdna) { - this.refSeqCdna = refSeqCdna; - return this; - } - public URLProperties getLrg() { return lrg; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index b7100a18a6..e2f613c500 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -64,10 +64,13 @@ public class EtlCommons { // RefSeq public static final String REFSEQ_NAME = "RefSeq"; + public static final String REFSEQ_DATA = "refseq"; public static final String REFSEQ_VERSION_FILENAME = "refSeq" + SUFFIX_VERSION_FILENAME; -// public static final String REFSEQ_ASTA_VERSION_FILENAME = REFSEQ_DATA + "Fasta" + SUFFIX_VERSION_FILENAME; -// public static final String REFSEQ_PROTEIN_FASTA_VERSION_FILENAME = REFSEQ_DATA + "ProteinFasta" + SUFFIX_VERSION_FILENAME; -// public static final String REFSEQ_CDNA_FASTA_VERSION_FILENAME = REFSEQ_DATA + "CdnaFasta" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String REFSEQ_GENOMIC_GTF_FILE_ID = "GENOMIC_GTF"; + public static final String REFSEQ_GENOMIC_FNA_FILE_ID = "GENOMIC_FNA"; + public static final String REFSEQ_PROTEIN_FAA_FILE_ID = "PROTEIN_FAA"; + public static final String REFSEQ_RNA_FNA_FILE_ID = "RNA_FNA"; // MANE Select public static final String MANE_SELECT_NAME = "MANE Select"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index 679b9aaa95..1f02a574ef 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -16,17 +16,15 @@ package org.opencb.cellbase.lib.download; -import org.apache.commons.lang.StringUtils; import org.opencb.cellbase.core.config.CellBaseConfiguration; -import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.lib.EtlCommons; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.Paths; import java.util.*; +import java.util.stream.Collectors; import static 
org.opencb.cellbase.lib.EtlCommons.*; @@ -119,60 +117,31 @@ private List downloadEnsemblData(Path geneFolder) throws IOExcepti return downloadFiles; } - private List downloadRefSeq(Path refSeqFolder) throws IOException, InterruptedException { + private List downloadRefSeq(Path refSeqFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - - logger.info("Downloading RefSeq data ..."); + logger.info("Downloading {} data ...", REFSEQ_NAME); List downloadFiles = new ArrayList<>(); - String timeStamp = getTimeStamp(); - - // gtf - dow - DownloadFile downloadFile = downloadRefSeqFile(REFSEQ_NAME, configuration.getDownload().getRefSeq(), timeStamp, - REFSEQ_VERSION_FILENAME, refSeqFolder); - downloadFiles.add(downloadFile); - - // genomic fasta - downloadFile = downloadRefSeqFile(REFSEQ_NAME + " Fasta", configuration.getDownload().getRefSeqFasta(), timeStamp, - REFSEQ_ASTA_VERSION_FILENAME, refSeqFolder); - downloadFiles.add(downloadFile); - if (StringUtils.isNotEmpty(downloadFile.getOutputFile()) && Paths.get(downloadFile.getOutputFile()).toFile().exists()) { - logger.info("Unzipping file: {}", downloadFile.getOutputFile()); - EtlCommons.runCommandLineProcess(null, "gunzip", Collections.singletonList(downloadFile.getOutputFile()), null); - } else { - logger.warn("Coud not find the file {} to unzip", downloadFile.getOutputFile()); - } - - // protein fasta - downloadFile = downloadRefSeqFile(REFSEQ_NAME + " Protein Fasta", configuration.getDownload().getRefSeqProteinFasta(), - timeStamp, REFSEQ_PROTEIN_FASTA_VERSION_FILENAME, refSeqFolder); - downloadFiles.add(downloadFile); - + // GTF + downloadFiles.add(downloadDataSource(configuration.getDownload().getRefSeq(), REFSEQ_GENOMIC_GTF_FILE_ID, refSeqFolder)); + // Genomic FASTA + downloadFiles.add(downloadDataSource(configuration.getDownload().getRefSeq(), REFSEQ_GENOMIC_FNA_FILE_ID, refSeqFolder)); + // Protein FASTA + 
downloadFiles.add(downloadDataSource(configuration.getDownload().getRefSeq(), REFSEQ_PROTEIN_FAA_FILE_ID, refSeqFolder)); // cDNA - downloadFile = downloadRefSeqFile(REFSEQ_NAME + " cDNA", configuration.getDownload().getRefSeqCdna(), timeStamp, - REFSEQ_CDNA_FASTA_VERSION_FILENAME, refSeqFolder); - downloadFiles.add(downloadFile); + downloadFiles.add(downloadDataSource(configuration.getDownload().getRefSeq(), REFSEQ_RNA_FNA_FILE_ID, refSeqFolder)); + + // Save data source (i.e., metadata) + saveDataSource(REFSEQ_NAME, GENE_DATA, configuration.getDownload().getRefSeq().getVersion(), getTimeStamp(), + downloadFiles.stream().map(DownloadFile::getUrl).collect(Collectors.toList()), + refSeqFolder.resolve(REFSEQ_VERSION_FILENAME)); return downloadFiles; } return Collections.emptyList(); } - private DownloadFile downloadRefSeqFile(String name, DownloadProperties.URLProperties urlProperties, String timeStamp, - String versionFilename, Path refSeqFolder) throws IOException, InterruptedException { - String url = urlProperties.getHost(); - String version = urlProperties.getVersion(); - String filename = getFilenameFromUrl(url); - Path outputPath = refSeqFolder.resolve(filename); - saveDataSource(name, EtlCommons.REFSEQ_NAME, version, timeStamp, Collections.singletonList(url), - refSeqFolder.resolve(versionFilename)); - - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); - return downloadFile(url, outputPath.toString()); - } - private DownloadFile downloadMane(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info("Downloading {} ...", MANE_SELECT_NAME); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index df4aa069bf..a6c17809b2 100644 --- 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -153,31 +153,31 @@ public List downloadConservation() throws IOException, Interrupted } } - if (speciesConfiguration.getScientificName().equals("Mus musculus")) { - Files.createDirectories(conservationFolder); - Files.createDirectories(conservationFolder.resolve(PHASTCONS_SUBDIRECTORY)); - Files.createDirectories(conservationFolder.resolve(PHYLOP_SUBDIRECTORY)); - - String url = configuration.getDownload().getConservation().getHost() + "/mm10"; - String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", - "15", "16", "17", "18", "19", "X", "Y", "M", }; - List phastconsUrls = new ArrayList<>(chromosomes.length); - List phyloPUrls = new ArrayList<>(chromosomes.length); - for (String chromosome : chromosomes) { - String phastConsUrl = url + "/phastCons60way/mm10.60way.phastCons/chr" + chromosome + ".phastCons60way.wigFix.gz"; - downloadFiles.add(downloadFile(phastConsUrl, conservationFolder.resolve(PHASTCONS_SUBDIRECTORY).resolve("chr" + chromosome - + ".phastCons60way.wigFix.gz").toString())); - phastconsUrls.add(phastConsUrl); - String phyloPUrl = url + "/phyloP60way/mm10.60way.phyloP60way/chr" + chromosome + ".phyloP60way.wigFix.gz"; - downloadFiles.add(downloadFile(phyloPUrl, conservationFolder.resolve(PHASTCONS_SUBDIRECTORY).resolve("chr" + chromosome - + ".phyloP60way.wigFix.gz").toString())); - phyloPUrls.add(phyloPUrl); - } - saveDataSource(PHASTCONS_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getPhastCons().getVersion(), - getTimeStamp(), phastconsUrls, conservationFolder.resolve(PHASTCONS_VERSION_FILENAME)); - saveDataSource(PHYLOP_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getPhylop().getVersion(), - getTimeStamp(), phyloPUrls, conservationFolder.resolve(PHYLOP_VERSION_FILENAME)); - } +// if 
(speciesConfiguration.getScientificName().equals("Mus musculus")) { +// Files.createDirectories(conservationFolder); +// Files.createDirectories(conservationFolder.resolve(PHASTCONS_SUBDIRECTORY)); +// Files.createDirectories(conservationFolder.resolve(PHYLOP_SUBDIRECTORY)); +// +// String url = configuration.getDownload().getConservation().getHost() + "/mm10"; +// String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", +// "15", "16", "17", "18", "19", "X", "Y", "M", }; +// List phastconsUrls = new ArrayList<>(chromosomes.length); +// List phyloPUrls = new ArrayList<>(chromosomes.length); +// for (String chromosome : chromosomes) { +// String phastConsUrl = url + "/phastCons60way/mm10.60way.phastCons/chr" + chromosome + ".phastCons60way.wigFix.gz"; +// downloadFiles.add(downloadFile(phastConsUrl, conservationFolder.resolve(PHASTCONS_SUBDIRECTORY).resolve("chr" + chromosome +// + ".phastCons60way.wigFix.gz").toString())); +// phastconsUrls.add(phastConsUrl); +// String phyloPUrl = url + "/phyloP60way/mm10.60way.phyloP60way/chr" + chromosome + ".phyloP60way.wigFix.gz"; +// downloadFiles.add(downloadFile(phyloPUrl, conservationFolder.resolve(PHASTCONS_SUBDIRECTORY).resolve("chr" + chromosome +// + ".phyloP60way.wigFix.gz").toString())); +// phyloPUrls.add(phyloPUrl); +// } +// saveDataSource(PHASTCONS_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getPhastCons().getVersion(), +// getTimeStamp(), phastconsUrls, conservationFolder.resolve(PHASTCONS_VERSION_FILENAME)); +// saveDataSource(PHYLOP_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getPhylop().getVersion(), +// getTimeStamp(), phyloPUrls, conservationFolder.resolve(PHYLOP_VERSION_FILENAME)); +// } return downloadFiles; } From 1b751de5f80031e42096ba03c86bd383bec00647 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 18 Apr 2024 16:10:07 +0200 Subject: [PATCH 025/148] lib: update missense 
scores (REVEL) downloader, #TASK-5775, #TASK-5564 --- .../src/main/java/org/opencb/cellbase/lib/EtlCommons.java | 1 + .../lib/download/MissenseScoresDownloadManager.java | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index e2f613c500..34ef38baac 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -151,6 +151,7 @@ public class EtlCommons { public static final String PHARMGKB_VERSION_FILENAME = "pharmgkb" + SUFFIX_VERSION_FILENAME; // Missense variantion functional score + public static final String MISSENSE_VARIATION_SCORE_NAME = "Missense Variation Functional Scores"; public static final String MISSENSE_VARIATION_SCORE_DATA = "missense_variation_functional_score"; // Revel public static final String REVEL_NAME = "Revel"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java index 50cf9ee0c0..ca491a97fe 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java @@ -44,10 +44,10 @@ public DownloadFile downloadRevel() throws IOException, InterruptedException, Ce if (speciesConfiguration.getScientificName().equals(EtlCommons.HOMO_SAPIENS_NAME)) { Path missensePredictionScorePath = downloadFolder.resolve(EtlCommons.MISSENSE_VARIATION_SCORE_DATA); Files.createDirectories(missensePredictionScorePath); - logger.info("Downloading Revel data at {} ...", missensePredictionScorePath); - return downloadAndSaveDataSource(configuration.getDownload().getRevel(), REVEL_NAME, - MISSENSE_VARIATION_SCORE_DATA, REVEL_FILE_ID, 
REVEL_VERSION_FILENAME, missensePredictionScorePath); + logger.info("Downloading {}/{} ...", MISSENSE_VARIATION_SCORE_NAME, REVEL_NAME); + return downloadAndSaveDataSource(configuration.getDownload().getRevel(), REVEL_FILE_ID, REVEL_NAME, + MISSENSE_VARIATION_SCORE_DATA, REVEL_VERSION_FILENAME, missensePredictionScorePath); } return null; } From b63533324c221b05e1e5615b39b90e6c47de0898 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 18 Apr 2024 16:15:58 +0200 Subject: [PATCH 026/148] lib: update CADD and clinical variant downloaders, #TASK-5775, #TASK-5564 --- .../org/opencb/cellbase/lib/download/CaddDownloadManager.java | 4 ++-- .../opencb/cellbase/lib/download/ClinicalDownloadManager.java | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java index af3ff65baf..738c66f3f1 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java @@ -46,8 +46,8 @@ public List download() throws IOException, InterruptedException, C // Download CADD and save data source - DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getCadd(), CADD_NAME, - VARIATION_FUNCTIONAL_SCORE_DATA, CADD_FILE_ID, CADD_VERSION_FILENAME, variationFunctionalScoreFolder); + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getCadd(), CADD_FILE_ID, CADD_NAME, + VARIATION_FUNCTIONAL_SCORE_DATA, CADD_VERSION_FILENAME, variationFunctionalScoreFolder); return Collections.singletonList(downloadFile); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java 
index 1e66f1b5f0..37561b111f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java @@ -80,8 +80,8 @@ public List downloadClinical() throws IOException, InterruptedExce clinicalFolder.resolve(HGMD_VERSION_FILENAME)); // GWAS catalog - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGwasCatalog(), GWAS_NAME, CLINICAL_VARIANTS_DATA, - GWAS_FILE_ID, GWAS_VERSION_FILENAME, clinicalFolder); + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGwasCatalog(), GWAS_FILE_ID, GWAS_NAME, + CLINICAL_VARIANTS_DATA, GWAS_VERSION_FILENAME, clinicalFolder); downloadFiles.add(downloadFile); // ClinVar From 106b96d1ff01003187fc9b43e2d80c78556d97e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 18 Apr 2024 16:21:08 +0200 Subject: [PATCH 027/148] lib: update protein downloaders, #TASK-5775, #TASK-5564 --- .../src/main/java/org/opencb/cellbase/lib/EtlCommons.java | 1 + .../cellbase/lib/download/ProteinDownloadManager.java | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 34ef38baac..b31cf14d86 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -286,6 +286,7 @@ public class EtlCommons { public static final String PROTEIN_FUNCTIONAL_PREDICTION_DATA = "protein_functional_prediction"; // Protein + public static final String PROTEIN_NAME = "Protein"; public static final String PROTEIN_DATA = "protein"; public static final String PROTEIN_SUBDIRECTORY = "protein"; // UniProt diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java index 519ea828d1..50255a3557 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java @@ -51,13 +51,13 @@ public List download() throws IOException, InterruptedException, C } Path proteinFolder = downloadFolder.resolve(PROTEIN_SUBDIRECTORY); Files.createDirectories(proteinFolder); - logger.info("Downloading protein information at {} ..."); + logger.info("Downloading {} information at {} ...", PROTEIN_NAME, proteinFolder); DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); // Uniprot - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getUniprot(), UNIPROT_NAME, PROTEIN_DATA, UNIPROT_FILE_ID, + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getUniprot(), UNIPROT_FILE_ID, UNIPROT_NAME, PROTEIN_DATA, UNIPROT_VERSION_FILENAME, proteinFolder); Path chunksPath = proteinFolder.resolve(UNIPROT_CHUNKS_SUBDIRECTORY); String uniprotFilename = getFilenameFromUrl(configuration.getDownload().getUniprot().getFiles().get(UNIPROT_FILE_ID)); @@ -67,12 +67,12 @@ public List download() throws IOException, InterruptedException, C downloadFiles.add(downloadFile); // Interpro - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getInterpro(), INTERPRO_NAME, PROTEIN_DATA, INTERPRO_FILE_ID, + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getInterpro(), INTERPRO_FILE_ID, INTERPRO_NAME, PROTEIN_DATA, INTERPRO_VERSION_FILENAME, proteinFolder); downloadFiles.add(downloadFile); // Intact - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getIntact(), INTACT_NAME, PROTEIN_DATA, INTACT_FILE_ID, + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getIntact(), INTACT_FILE_ID, INTACT_NAME, PROTEIN_DATA, INTACT_VERSION_FILENAME, 
proteinFolder); downloadFiles.add(downloadFile); From 55afe6b8383913f0e6d25d55b08f1bcb5765c049 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 19 Apr 2024 11:22:56 +0200 Subject: [PATCH 028/148] lib: update gene downloader (specially for ensembl data), and improve log messages, #TASK-5564 --- .../src/main/resources/configuration.yml | 9 +- .../org/opencb/cellbase/lib/EtlCommons.java | 33 +++-- .../lib/download/AbstractDownloadManager.java | 8 +- .../lib/download/ClinicalDownloadManager.java | 2 +- .../lib/download/GeneDownloadManager.java | 116 +++++++++--------- .../lib/download/GenomeDownloadManager.java | 6 +- .../download/RegulationDownloadManager.java | 7 +- 7 files changed, 101 insertions(+), 80 deletions(-) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 70acaf8776..ffb16393d0 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -55,9 +55,12 @@ download: url: host: https://ftp.ensembl.org/pub/ files: - REGULATORY_BUILD: "regulation/put_species_here/put_species_here.put_assembly_here.Regulatory_Build.regulatory_features.20221007.gff.gz" - MOTIF_FEATURES: "regulation/put_species_here/MotifFeatures/put_species_here.put_assembly_here.motif_features.gff.gz" - MOTIF_FEATURES_INDEX: "regulation/put_species_here/MotifFeatures/put_species_here.put_assembly_here.motif_features.gff.gz.tbi" + GTF: "release-put_release_here/gtf/put_species_here/put_capital_species_here.put_assembly_here.put_release_here.gtf.gz" + PEP_FA: "release-put_release_here/fasta/put_species_here/pep/put_capital_species_here.put_assembly_here.pep.all.fa.gz" + CDNA_FA: "release-put_release_here/fasta/put_species_here/cdna/put_capital_species_here.put_assembly_here.cdna.all.fa.gz" + REGULATORY_BUILD: 
"regulation/put_species_here/put_species_here.put_capital_assembly_here.Regulatory_Build.regulatory_features.20221007.gff.gz" + MOTIF_FEATURES: "regulation/put_species_here/MotifFeatures/put_capital_species_here.put_assembly_here.motif_features.gff.gz" + MOTIF_FEATURES_INDEX: "regulation/put_species_here/MotifFeatures/put_capital_species_here.put_assembly_here.motif_features.gff.gz.tbi" ensemblGenomes: database: host: mysql-eg-publicsql.ebi.ac.uk:4157 diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index b31cf14d86..3fa1c6f4ec 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -38,14 +38,19 @@ public class EtlCommons { // Ensembl - public static final String ENSEMBL_NAME = "ENSEMBL"; + public static final String ENSEMBL_NAME = "Ensembl"; + public static final String PUT_RELEASE_HERE_MARK = "put_release_here"; public static final String PUT_SPECIES_HERE_MARK = "put_species_here"; + public static final String PUT_CAPITAL_SPECIES_HERE_MARK = "put_capital_species_here"; public static final String PUT_ASSEMBLY_HERE_MARK = "put_assembly_here"; public static final String PUT_CHROMOSOME_HERE_MARK = "put_chromosome_here"; // Must match the configuration file - public static final String REGULATORY_BUILD_FILE_ID = "REGULATORY_BUILD"; - public static final String MOTIF_FEATURES_FILE_ID = "MOTIF_FEATURES"; - public static final String MOTIF_FEATURES_INDEX_FILE_ID = "MOTIF_FEATURES_INDEX"; + public static final String ENSEMBL_GTF_FILE_ID = "GTF"; + public static final String ENSEMBL_PEP_FA_FILE_ID = "PEP_FA"; + public static final String ENSEMBL_CDNA_FA_FILE_ID = "CDNA_FA"; + public static final String ENSEMBL_REGULATORY_BUILD_FILE_ID = "REGULATORY_BUILD"; + public static final String ENSEMBL_MOTIF_FEATURES_FILE_ID = "MOTIF_FEATURES"; + public static final String 
ENSEMBL_MOTIF_FEATURES_INDEX_FILE_ID = "MOTIF_FEATURES_INDEX"; public static final String HOMO_SAPIENS_NAME= "Homo sapiens"; @@ -149,6 +154,16 @@ public class EtlCommons { public static final String PHARMGKB_DATA = "pharmgkb"; public static final String PHARMGKB_SUBDIRECTORY = "pharmgkb"; public static final String PHARMGKB_VERSION_FILENAME = "pharmgkb" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String PHARMGKB_GENES_FILE_ID = "GENES"; + public static final String PHARMGKB_CHEMICALS_FILE_ID = "CHEMICALS"; + public static final String PHARMGKB_VARIANTS_FILE_ID = "VARIANTS"; + public static final String PHARMGKB_GUIDELINE_ANNOTATIONS_FILE_ID = "GUIDELINE_ANNOTATIONS"; + public static final String PHARMGKB_VARIANT_ANNOTATIONS_FILE_ID = "VARIANT_ANNOTATIONS"; + public static final String PHARMGKB_CLINICAL_ANNOTATIONS_FILE_ID = "CLINICAL_ANNOTATIONS"; + public static final String PHARMGKB_CLINICAL_VARIANTS_FILE_ID = "CLINICAL_VARIANTS"; + public static final String PHARMGKB_DRUG_LABELS_FILE_ID = "DRUG_LABELS"; + public static final String PHARMGKB_RELATIONSHIPS_FILE_ID = "RELATIONSHIPS"; // Missense variantion functional score public static final String MISSENSE_VARIATION_SCORE_NAME = "Missense Variation Functional Scores"; @@ -438,11 +453,15 @@ public static String getEnsemblUrl(DownloadProperties.EnsemblProperties props, S throw new CellBaseException("File ID " + fileId + " is missing in the DownloadProperties.EnsemblProperties within the CellBase" + " configuration file"); } - String filesValue = props.getUrl().getFiles().get(fileId); - String url = props.getUrl().getHost() + ensemblRelease + "/" + filesValue; - // Change species, assembly, chromosome if necessary + String url = props.getUrl().getHost() + props.getUrl().getFiles().get(fileId); + + // Change release, species, assembly, chromosome if necessary + if (StringUtils.isNotEmpty(ensemblRelease)) { + url = url.replaceAll(PUT_RELEASE_HERE_MARK, 
ensemblRelease.split("-")[1]); + } if (StringUtils.isNotEmpty(species)) { url = url.replaceAll(PUT_SPECIES_HERE_MARK, species); + url = url.replaceAll(PUT_CAPITAL_SPECIES_HERE_MARK, Character.toUpperCase(species.charAt(0)) + species.substring(1)); } if (StringUtils.isNotEmpty(assembly)) { url = url.replaceAll(PUT_ASSEMBLY_HERE_MARK, assembly); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index 74ecbe4d4a..f3f01e7c30 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -49,7 +49,9 @@ public abstract class AbstractDownloadManager { - protected static final String DOWNLOADING_LOG_MESSAGE = "Downloading {} to {} ..."; + protected static final String DOWNLOADING_LOG_MESSAGE = "Downloading {} ..."; + protected static final String DOWNLOADING_FROM_TO_LOG_MESSAGE = "Downloading {} to {} ..."; + protected static final String DOWNLOADING_DONE_LOG_MESSAGE = "Downloading {} done!"; protected String species; protected String assembly; @@ -179,7 +181,7 @@ protected DownloadFile downloadDataSource(DownloadProperties.URLProperties props throws IOException, InterruptedException, CellBaseException { String url = EtlCommons.getUrl(props, fileId, species, assembly, chromosome); File outFile = outPath.resolve(getFilenameFromUrl(url)).toFile(); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outFile); + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outFile); return downloadFile(url, outFile.toString()); } @@ -193,7 +195,7 @@ protected DownloadFile downloadEnsemblDataSource(DownloadProperties.EnsemblPrope String url = EtlCommons.getEnsemblUrl(ensemblProps, ensemblRelease, fileId, speciesShortName, assemblyConfiguration.getName(), chromosome); File outFile = 
outPath.resolve(getFilenameFromUrl(url)).toFile(); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outFile); + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outFile); return downloadFile(url, outFile.toString()); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java index 37561b111f..77f658626a 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java @@ -92,7 +92,7 @@ public List downloadClinical() throws IOException, InterruptedExce CLINVAR_EFO_TERMS_FILE_ID)) { url = props.getHost() + props.getFiles().get(fileId); outPath = clinicalFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outPath); + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outPath); downloadFiles.add(downloadFile(url, outPath.toString())); urls.add(url); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index 1f02a574ef..7ea434c24c 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -74,52 +74,30 @@ public List download() throws IOException, InterruptedException, C return downloadFiles; } - private List downloadEnsemblData(Path geneFolder) throws IOException, InterruptedException { - logger.info("Downloading gene Ensembl data (gtf, pep, cdna, motifs) ..."); - List downloadedUrls = new ArrayList<>(4); + private List downloadEnsemblData(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + logger.info(DOWNLOADING_LOG_MESSAGE, ENSEMBL_NAME); + List downloadFiles = new ArrayList<>(); - 
String ensemblHost = ensemblHostUrl + "/" + ensemblRelease; - if (!configuration.getSpecies().getVertebrates().contains(speciesConfiguration)) { - ensemblHost = ensemblHostUrl + "/" + ensemblRelease + "/" + getPhylo(speciesConfiguration); - } + // GTF + downloadFiles.add(downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_GTF_FILE_ID, geneFolder)); + // PEP + downloadFiles.add(downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_PEP_FA_FILE_ID, geneFolder)); + // CDNA + downloadFiles.add(downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_CDNA_FA_FILE_ID, geneFolder)); - String ensemblCollection = ""; - if (configuration.getSpecies().getBacteria().contains(speciesConfiguration)) { - // WARN: assuming there's just one assembly - ensemblCollection = speciesConfiguration.getAssemblies().get(0).getEnsemblCollection() + "/"; - } - - // Ensembl leaves now several GTF files in the FTP folder, we need to build a more accurate URL - // to download the correct GTF file. 
- String version = ensemblRelease.split("-")[1]; - String url = ensemblHost + "/gtf/" + ensemblCollection + speciesShortName + "/*" + version + ".gtf.gz"; - String fileName = geneFolder.resolve(speciesShortName + ".gtf.gz").toString(); - logger.info(DOWNLOADING_LOG_MESSAGE, url, fileName); - downloadFiles.add(downloadFile(url, fileName)); - downloadedUrls.add(url); - - url = ensemblHost + "/fasta/" + ensemblCollection + speciesShortName + "/pep/*.pep.all.fa.gz"; - fileName = geneFolder.resolve(speciesShortName + ".pep.all.fa.gz").toString(); - logger.info(DOWNLOADING_LOG_MESSAGE, url, fileName); - downloadFiles.add(downloadFile(url, fileName)); - downloadedUrls.add(url); - - url = ensemblHost + "/fasta/" + ensemblCollection + speciesShortName + "/cdna/*.cdna.all.fa.gz"; - fileName = geneFolder.resolve(speciesShortName + ".cdna.all.fa.gz").toString(); - logger.info(DOWNLOADING_LOG_MESSAGE, url, fileName); - downloadFiles.add(downloadFile(url, fileName)); - downloadedUrls.add(url); - - saveDataSource(EtlCommons.GENE_DATA, ENSEMBL_NAME, ensemblVersion, getTimeStamp(), downloadedUrls, + // Save data source (i.e., metadata) + saveDataSource(EtlCommons.GENE_DATA, ENSEMBL_NAME, ensemblVersion, getTimeStamp(), + downloadFiles.stream().map(DownloadFile::getUrl).collect(Collectors.toList()), geneFolder.resolve(ENSEMBL_CORE_VERSION_FILENAME)); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, ENSEMBL_NAME); return downloadFiles; } private List downloadRefSeq(Path refSeqFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading {} data ...", REFSEQ_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, REFSEQ_NAME); List downloadFiles = new ArrayList<>(); @@ -137,6 +115,7 @@ private List downloadRefSeq(Path refSeqFolder) throws IOException, downloadFiles.stream().map(DownloadFile::getUrl).collect(Collectors.toList()), refSeqFolder.resolve(REFSEQ_VERSION_FILENAME)); + 
logger.info(DOWNLOADING_DONE_LOG_MESSAGE, REFSEQ_NAME); return downloadFiles; } return Collections.emptyList(); @@ -144,66 +123,80 @@ private List downloadRefSeq(Path refSeqFolder) throws IOException, private DownloadFile downloadMane(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading {} ...", MANE_SELECT_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getManeSelect(), MANE_SELECT_FILE_ID, MANE_SELECT_NAME, GENE_DATA, - MANE_SELECT_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_LOG_MESSAGE, MANE_SELECT_NAME); + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getManeSelect(), MANE_SELECT_FILE_ID, + MANE_SELECT_NAME, GENE_DATA, MANE_SELECT_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, MANE_SELECT_NAME); + return downloadFile; } return null; } private DownloadFile downloadLrg(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading {} ...", LRG_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getLrg(), LRG_FILE_ID, LRG_NAME, GENE_DATA, LRG_VERSION_FILENAME, - geneFolder); + logger.info(DOWNLOADING_LOG_MESSAGE, LRG_NAME); + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getLrg(), LRG_FILE_ID, LRG_NAME, GENE_DATA, + LRG_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, LRG_NAME); + return downloadFile; } return null; } private DownloadFile downloadHgnc(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading {} ...", HGNC_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getHgnc(), HGNC_FILE_ID, HGNC_NAME, 
GENE_DATA, + logger.info(DOWNLOADING_LOG_MESSAGE, HGNC_NAME); + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getHgnc(), HGNC_FILE_ID, HGNC_NAME, GENE_DATA, HGNC_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, HGNC_NAME); + return downloadFile; } return null; } private DownloadFile downloadCancerHotspot(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading {} ...", CANCER_HOTSPOT_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getCancerHotspot(), CANCER_HOTSPOT_FILE_ID, CANCER_HOTSPOT_NAME, - GENE_DATA, CANCER_HOTSPOT_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_LOG_MESSAGE, CANCER_HOTSPOT_NAME); + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getCancerHotspot(), CANCER_HOTSPOT_FILE_ID, + CANCER_HOTSPOT_NAME, GENE_DATA, CANCER_HOTSPOT_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, CANCER_HOTSPOT_NAME); + return downloadFile; } return null; } private DownloadFile downloadDrugData(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading {} ...", DGIDB_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getDgidb(), DGIDB_FILE_ID, DGIDB_NAME, GENE_DATA, - DGIDB_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_LOG_MESSAGE, DGIDB_NAME); + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDgidb(), DGIDB_FILE_ID, DGIDB_NAME, + GENE_DATA, DGIDB_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, DGIDB_NAME); + return downloadFile; } return null; } private DownloadFile downloadGeneUniprotXref(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if 
(GENE_UNIPROT_XREF_FILES.containsKey(speciesConfiguration.getScientificName())) { - logger.info("Downloading {} ...", UNIPROT_XREF_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getGeneUniprotXref(), UNIPROT_XREF_FILE_ID, UNIPROT_XREF_NAME, - GENE_DATA, UNIPROT_XREF_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_LOG_MESSAGE, UNIPROT_XREF_NAME); + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGeneUniprotXref(), UNIPROT_XREF_FILE_ID, + UNIPROT_XREF_NAME, GENE_DATA, UNIPROT_XREF_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, UNIPROT_XREF_NAME); + return downloadFile; } return null; } private DownloadFile downloadGeneExpressionAtlas(Path geneFolder) throws IOException, InterruptedException, CellBaseException { - logger.info("Downloading {} ...", GENE_EXPRESSION_ATLAS_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getGeneExpressionAtlas(), GENE_EXPRESSION_ATLAS_FILE_ID, - GENE_EXPRESSION_ATLAS_NAME, GENE_DATA, GENE_EXPRESSION_ATLAS_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_LOG_MESSAGE, GENE_EXPRESSION_ATLAS_NAME); + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGeneExpressionAtlas(), + GENE_EXPRESSION_ATLAS_FILE_ID, GENE_EXPRESSION_ATLAS_NAME, GENE_DATA, GENE_EXPRESSION_ATLAS_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, GENE_EXPRESSION_ATLAS_NAME); + return downloadFile; } private DownloadFile downloadGeneDiseaseAnnotation(Path geneFolder) throws IOException, InterruptedException, CellBaseException { - logger.info("Downloading {} ...", GENE_DISEASE_ANNOTATION_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, GENE_DISEASE_ANNOTATION_NAME); // IMPORTANT !!! 
logger.warn("{} must be downloaded manually from {} and then create the file {} with data ({}), name ({}) and the version", @@ -211,13 +204,16 @@ private DownloadFile downloadGeneDiseaseAnnotation(Path geneFolder) throws IOExc saveDataSource(HPO_NAME, GENE_DISEASE_ANNOTATION_NAME, configuration.getDownload().getHpo().getVersion(), getTimeStamp(), Collections.singletonList(configuration.getDownload().getHpo().getHost()), geneFolder.resolve(HPO_VERSION_FILENAME)); - return downloadAndSaveDataSource(configuration.getDownload().getDisgenet(), DISGENET_FILE_ID, DISGENET_NAME, + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDisgenet(), DISGENET_FILE_ID, DISGENET_NAME, GENE_DISEASE_ANNOTATION_NAME, DISGENET_VERSION_FILENAME, geneFolder); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, GENE_DISEASE_ANNOTATION_NAME); + return downloadFile; } private DownloadFile downloadGnomadConstraints(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading {} ...", GNOMAD_CONSTRAINTS_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, GNOMAD_CONSTRAINTS_NAME); return downloadAndSaveDataSource(configuration.getDownload().getGnomadConstraints(), GNOMAD_CONSTRAINTS_FILE_ID, GNOMAD_CONSTRAINTS_NAME, GENE_DATA, GNOMAD_CONSTRAINTS_VERSION_FILENAME, geneFolder); } @@ -226,7 +222,7 @@ private DownloadFile downloadGnomadConstraints(Path geneFolder) throws IOExcepti private DownloadFile downloadGO(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading {} ...", GO_ANNOTATION_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, GO_ANNOTATION_NAME); return downloadAndSaveDataSource(configuration.getDownload().getGoAnnotation(), GO_ANNOTATION_FILE_ID, GO_ANNOTATION_NAME, GENE_DATA, GO_ANNOTATION_VERSION_FILENAME, geneFolder); } diff 
--git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index a6c17809b2..1ef4e66ae0 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -206,7 +206,7 @@ public List downloadRepeats() throws IOException, InterruptedExcep Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.TRF_VERSION_FILENAME)); Path outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); // Download genomic super duplications @@ -216,7 +216,7 @@ public List downloadRepeats() throws IOException, InterruptedExcep getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.GSD_VERSION_FILENAME)); outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); // Download WindowMasker @@ -227,7 +227,7 @@ public List downloadRepeats() throws IOException, InterruptedExcep getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.WM_VERSION_FILENAME)); outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); } return downloadFiles; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java index 
26ed4776da..d11e907aa0 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java @@ -73,17 +73,18 @@ private List downloadRegulatoryaAndMotifFeatures() throws IOExcept List downloadFiles = new ArrayList<>(); // Regulatory build - downloadFile = downloadAndSaveEnsemblDataSource(configuration.getDownload().getEnsembl(), REGULATORY_BUILD_FILE_ID, + downloadFile = downloadAndSaveEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_REGULATORY_BUILD_FILE_ID, REGULATORY_BUILD_NAME, REGULATION_DATA, null, REGULATORY_BUILD_VERSION_FILENAME, regulationFolder); downloadFiles.add(downloadFile); // Motifs features List urls = new ArrayList<>(); - downloadFile = downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), MOTIF_FEATURES_FILE_ID, null, regulationFolder); + downloadFile = downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_MOTIF_FEATURES_FILE_ID, null, + regulationFolder); downloadFiles.add(downloadFile); urls.add(downloadFile.getUrl()); // And now the index file - downloadFile = downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), MOTIF_FEATURES_INDEX_FILE_ID, null, + downloadFile = downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_MOTIF_FEATURES_INDEX_FILE_ID, null, regulationFolder); downloadFiles.add(downloadFile); urls.add(downloadFile.getUrl()); From 88c2b17614fe7af59db9372b00da2d15c6ccfc45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 22 Apr 2024 10:32:24 +0200 Subject: [PATCH 029/148] core: add Ensembl primary fasta URL into the configuration file for the genome downloader, #TASK-5564 --- cellbase-core/src/main/resources/configuration.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cellbase-core/src/main/resources/configuration.yml 
b/cellbase-core/src/main/resources/configuration.yml index ffb16393d0..5022340bec 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -55,6 +55,8 @@ download: url: host: https://ftp.ensembl.org/pub/ files: + # New Homo sapiens assemblies contain too many ALT regions, so we download 'primary_assembly' file instead + PRIMARY_FA: "release-put_release_here/fasta/put_species_here/dna/put_capital_species_here.put_assembly_here.dna.primary_assembly.fa.gz" GTF: "release-put_release_here/gtf/put_species_here/put_capital_species_here.put_assembly_here.put_release_here.gtf.gz" PEP_FA: "release-put_release_here/fasta/put_species_here/pep/put_capital_species_here.put_assembly_here.pep.all.fa.gz" CDNA_FA: "release-put_release_here/fasta/put_species_here/cdna/put_capital_species_here.put_assembly_here.cdna.all.fa.gz" From eee13e30826bde6774910adec721f35977a783bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 22 Apr 2024 10:34:44 +0200 Subject: [PATCH 030/148] lib: update genome download manager by declaring and using constants from the class EtlCommand and improve log messages, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 9 +- .../cellbase/lib/download/Downloader.java | 2 +- .../lib/download/GenomeDownloadManager.java | 95 ++++++------------- 3 files changed, 36 insertions(+), 70 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 3fa1c6f4ec..3a98939a23 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -45,6 +45,7 @@ public class EtlCommons { public static final String PUT_ASSEMBLY_HERE_MARK = "put_assembly_here"; public static final String PUT_CHROMOSOME_HERE_MARK = "put_chromosome_here"; // Must match the configuration file 
+ public static final String ENSEMBL_PRIMARY_FA_FILE_ID = "PRIMARY_FA"; public static final String ENSEMBL_GTF_FILE_ID = "GTF"; public static final String ENSEMBL_PEP_FA_FILE_ID = "PEP_FA"; public static final String ENSEMBL_CDNA_FA_FILE_ID = "CDNA_FA"; @@ -61,9 +62,13 @@ public class EtlCommons { public static final String SUFFIX_VERSION_FILENAME = "Version.json"; + // Genome (Ensembl) + public static final String GENOME_NAME = "Genome"; public static final String GENOME_DATA = "genome"; + public static final String GENOME_SUBDIRECTORY = "genome"; public static final String GENOME_VERSION_FILENAME = "genome" + SUFFIX_VERSION_FILENAME; + // Gene (Ensembl) public static final String GENE_DATA = "gene"; public static final String ENSEMBL_CORE_VERSION_FILENAME = "ensemblCore" + SUFFIX_VERSION_FILENAME; @@ -201,7 +206,8 @@ public class EtlCommons { // Must match the configuration file public static final String GWAS_FILE_ID = "GWAS"; - public static final String STRUCTURAL_VARIANTS_DATA = "svs"; + // Repeats + public static final String REPEATS_NAME = "Repeats"; public static final String REPEATS_DATA = "repeats"; public static final String REPEATS_SUBDIRECTORY = "genome"; public static final String REPEATS_JSON = "repeats"; @@ -322,6 +328,7 @@ public class EtlCommons { public static final String INTACT_FILE_ID = "INTACT"; // Conservation scores + public static final String CONSERVATION_NAME = "Conservation"; public static final String CONSERVATION_DATA = "conservation"; public static final String CONSERVATION_SUBDIRECTORY = "conservation"; // GERP diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java index 17022cae4b..d72d077e4a 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java @@ -40,7 +40,7 @@ public Downloader(String species, String 
assembly, Path outputDirectory, CellBas public List downloadGenome() throws IOException, CellBaseException, InterruptedException { GenomeDownloadManager manager = new GenomeDownloadManager(species, assembly, outputDirectory, configuration); - return manager.download(); + return manager.downloadReferenceGenome(); } public List downloadGene() throws IOException, CellBaseException, InterruptedException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index 1ef4e66ae0..f36f493e1f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -17,7 +17,6 @@ package org.opencb.cellbase.lib.download; import com.beust.jcommander.ParameterException; -import org.apache.commons.lang.StringUtils; import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.lib.EtlCommons; @@ -40,46 +39,27 @@ public GenomeDownloadManager(String species, String assembly, Path targetDirecto } @Override - public List download() throws IOException, InterruptedException { - List downloadFiles = new ArrayList<>(); - downloadFiles.addAll(downloadReferenceGenome()); - downloadFiles.addAll(downloadConservation()); - downloadFiles.addAll(downloadRepeats()); - - return downloadFiles; + public List download() throws IOException, InterruptedException, CellBaseException { + return downloadReferenceGenome(); } - public List downloadReferenceGenome() throws IOException, InterruptedException { - logger.info("Downloading genome information ..."); - Path sequenceFolder = downloadFolder.resolve("genome"); + public List downloadReferenceGenome() throws IOException, InterruptedException, CellBaseException { + logger.info(DOWNLOADING_LOG_MESSAGE, GENOME_NAME); 
+ Path sequenceFolder = downloadFolder.resolve(GENOME_SUBDIRECTORY); Files.createDirectories(sequenceFolder); // Reference genome sequences are downloaded from Ensembl // New Homo sapiens assemblies contain too many ALT regions, so we download 'primary_assembly' file instead - String url = ensemblHostUrl + "/" + ensemblRelease; - if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { - url = url + "/fasta/" + speciesShortName + "/dna/*.dna.primary_assembly.fa.gz"; - } else { - if (!configuration.getSpecies().getVertebrates().contains(speciesConfiguration)) { - url = ensemblHostUrl + "/" + ensemblRelease + "/" + getPhylo(speciesConfiguration); - } - url = url + "/fasta/"; - if (configuration.getSpecies().getBacteria().contains(speciesConfiguration)) { - // WARN: assuming there's just one assembly - url = url + speciesConfiguration.getAssemblies().get(0).getEnsemblCollection() + "/"; - } - url = url + speciesShortName + "/dna/*.dna.toplevel.fa.gz"; - } + DownloadFile downloadFile = downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_PRIMARY_FA_FILE_ID, + sequenceFolder); - String outputFileName = StringUtils.capitalize(speciesShortName) + "." 
+ assemblyConfiguration.getName() + ".fa.gz"; - Path outputPath = sequenceFolder.resolve(outputFileName); - logger.info("Saving reference genome version data at {}", sequenceFolder.resolve(GENOME_VERSION_FILENAME)); + // Save data source saveDataSource(ENSEMBL_NAME, EtlCommons.GENOME_DATA, ensemblVersion, getTimeStamp(), - Collections.singletonList(url), sequenceFolder.resolve(GENOME_VERSION_FILENAME)); - List downloadFiles = Collections.singletonList(downloadFile(url, outputPath.toString())); - logger.info("Unzipping file: {}", outputFileName); - EtlCommons.runCommandLineProcess(null, "gunzip", Collections.singletonList(outputPath.toString()), null); - return downloadFiles; + Collections.singletonList(downloadFile.getUrl()), sequenceFolder.resolve(GENOME_VERSION_FILENAME)); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, GENOME_NAME); + + return Collections.singletonList(downloadFile); } /** @@ -89,13 +69,14 @@ public List downloadReferenceGenome() throws IOException, Interrup * @throws InterruptedException if there is an error downloading files */ public List downloadConservation() throws IOException, InterruptedException { - if (!speciesHasInfoToDownload(speciesConfiguration, "conservation")) { + if (!speciesHasInfoToDownload(speciesConfiguration, CONSERVATION_DATA)) { return Collections.emptyList(); } - logger.info("Downloading conservation information ..."); - Path conservationFolder = downloadFolder.resolve(CONSERVATION_SUBDIRECTORY); List downloadFiles = new ArrayList<>(); if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + logger.info(DOWNLOADING_LOG_MESSAGE, CONSERVATION_NAME); + Path conservationFolder = downloadFolder.resolve(CONSERVATION_SUBDIRECTORY); + Files.createDirectories(conservationFolder); Files.createDirectories(conservationFolder.resolve(GERP_SUBDIRECTORY)); Files.createDirectories(conservationFolder.resolve(PHASTCONS_SUBDIRECTORY)); @@ -111,7 +92,7 @@ public List downloadConservation() throws IOException, Interrupted 
List phastconsUrls = new ArrayList<>(chromosomes.length); List phyloPUrls = new ArrayList<>(chromosomes.length); // Downloading PhastCons and PhyloP - logger.info("Downloading {} and {}", PHASTCONS_NAME, PHYLOP_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, (PHASTCONS_NAME + "/" + PHYLOP_NAME)); for (String chromosome : chromosomes) { // PhastCons String phastConsUrl = configuration.getDownload().getPhastCons().getHost() + configuration.getDownload().getPhastCons() @@ -119,7 +100,7 @@ public List downloadConservation() throws IOException, Interrupted .replace(PUT_CHROMOSOME_HERE_MARK, chromosome); filename = Paths.get(phastConsUrl).getFileName().toString(); outputPath = conservationFolder.resolve(PHASTCONS_SUBDIRECTORY).resolve(filename); - logger.info("Downloading {} from {} to {}", PHASTCONS_NAME, phastConsUrl, outputPath); + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, phastConsUrl, outputPath); downloadFiles.add(downloadFile(phastConsUrl, outputPath.toString())); phastconsUrls.add(phastConsUrl); @@ -129,18 +110,18 @@ public List downloadConservation() throws IOException, Interrupted .replace(PUT_CHROMOSOME_HERE_MARK, chromosome); filename = Paths.get(phyloPUrl).getFileName().toString(); outputPath = conservationFolder.resolve(PHYLOP_SUBDIRECTORY).resolve(filename); - logger.info("Downloading {} from {} to {}", PHYLOP_NAME, phyloPUrl, outputPath); + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, phyloPUrl, outputPath); downloadFiles.add(downloadFile(phyloPUrl, outputPath.toString())); phyloPUrls.add(phyloPUrl); } // Downloading Gerp - logger.info("Downloading {}", GERP_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, GERP_NAME); String gerpUrl = configuration.getDownload().getGerp().getHost() + configuration.getDownload().getGerp().getFiles() .get(GERP_FILE_ID); filename = Paths.get(gerpUrl).getFileName().toString(); outputPath = conservationFolder.resolve(GERP_SUBDIRECTORY).resolve(filename); - logger.info("Downloading from {} to {}", gerpUrl, outputPath); + 
logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, gerpUrl, outputPath); downloadFiles.add(downloadFile(gerpUrl, outputPath.toString())); // Save data version @@ -151,42 +132,18 @@ public List downloadConservation() throws IOException, Interrupted saveDataSource(GERP_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getGerp().getVersion(), getTimeStamp(), Collections.singletonList(gerpUrl), conservationFolder.resolve(GERP_VERSION_FILENAME)); } + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, CONSERVATION_NAME); } -// if (speciesConfiguration.getScientificName().equals("Mus musculus")) { -// Files.createDirectories(conservationFolder); -// Files.createDirectories(conservationFolder.resolve(PHASTCONS_SUBDIRECTORY)); -// Files.createDirectories(conservationFolder.resolve(PHYLOP_SUBDIRECTORY)); -// -// String url = configuration.getDownload().getConservation().getHost() + "/mm10"; -// String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", -// "15", "16", "17", "18", "19", "X", "Y", "M", }; -// List phastconsUrls = new ArrayList<>(chromosomes.length); -// List phyloPUrls = new ArrayList<>(chromosomes.length); -// for (String chromosome : chromosomes) { -// String phastConsUrl = url + "/phastCons60way/mm10.60way.phastCons/chr" + chromosome + ".phastCons60way.wigFix.gz"; -// downloadFiles.add(downloadFile(phastConsUrl, conservationFolder.resolve(PHASTCONS_SUBDIRECTORY).resolve("chr" + chromosome -// + ".phastCons60way.wigFix.gz").toString())); -// phastconsUrls.add(phastConsUrl); -// String phyloPUrl = url + "/phyloP60way/mm10.60way.phyloP60way/chr" + chromosome + ".phyloP60way.wigFix.gz"; -// downloadFiles.add(downloadFile(phyloPUrl, conservationFolder.resolve(PHASTCONS_SUBDIRECTORY).resolve("chr" + chromosome -// + ".phyloP60way.wigFix.gz").toString())); -// phyloPUrls.add(phyloPUrl); -// } -// saveDataSource(PHASTCONS_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getPhastCons().getVersion(), -// 
getTimeStamp(), phastconsUrls, conservationFolder.resolve(PHASTCONS_VERSION_FILENAME)); -// saveDataSource(PHYLOP_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getPhylop().getVersion(), -// getTimeStamp(), phyloPUrls, conservationFolder.resolve(PHYLOP_VERSION_FILENAME)); -// } return downloadFiles; } public List downloadRepeats() throws IOException, InterruptedException { - if (!speciesHasInfoToDownload(speciesConfiguration, "repeats")) { + if (!speciesHasInfoToDownload(speciesConfiguration, REPEATS_DATA)) { return Collections.emptyList(); } if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading repeats data ..."); + logger.info(DOWNLOADING_LOG_MESSAGE, REPEATS_NAME); Path repeatsFolder = downloadFolder.resolve(EtlCommons.REPEATS_SUBDIRECTORY); Files.createDirectories(repeatsFolder); List downloadFiles = new ArrayList<>(); @@ -230,6 +187,8 @@ public List downloadRepeats() throws IOException, InterruptedExcep logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); } + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, REPEATS_NAME); + return downloadFiles; } return Collections.emptyList(); From cd367b998dd0d3905e5a0162145b864a8b42857e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 22 Apr 2024 12:32:55 +0200 Subject: [PATCH 031/148] app: update genome builder by using constants from the class EtlCommons, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 110 ++++++++++++++---- 1 file changed, 88 insertions(+), 22 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 71b20e8b5a..482a6b5693 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ 
b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -17,11 +17,14 @@ package org.opencb.cellbase.app.cli.admin.executors; import com.beust.jcommander.ParameterException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectReader; import org.apache.commons.lang.StringUtils; import org.opencb.cellbase.app.cli.CommandExecutor; import org.opencb.cellbase.app.cli.admin.AdminCliOptionsParser; import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; import org.opencb.cellbase.core.serializer.CellBaseSerializer; @@ -33,12 +36,16 @@ import java.io.File; import java.io.IOException; -import java.nio.file.*; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; +import static org.opencb.cellbase.core.utils.SpeciesUtils.getSpeciesShortname; import static org.opencb.cellbase.lib.EtlCommons.*; /** @@ -52,6 +59,10 @@ public class BuildCommandExecutor extends CommandExecutor { private Path downloadFolder = null; // /_/download private boolean normalize = true; + private SpeciesConfiguration.Assembly assembly; + private String ensemblVersion; + private String ensemblRelease; + private File ensemblScriptsFolder; private boolean flexibleGTFParsing; @@ -83,7 +94,7 @@ public void execute() { if (speciesConfiguration == null) { throw new CellBaseException("Invalid species: '" + buildCommandOptions.species + "'"); } - SpeciesConfiguration.Assembly assembly = null; + if (!StringUtils.isEmpty(buildCommandOptions.assembly)) { assembly = 
SpeciesUtils.getAssembly(speciesConfiguration, buildCommandOptions.assembly); if (assembly == null) { @@ -93,7 +104,10 @@ public void execute() { assembly = SpeciesUtils.getDefaultAssembly(speciesConfiguration); } - String spShortName = SpeciesUtils.getSpeciesShortname(speciesConfiguration); + ensemblVersion = assembly.getEnsemblVersion(); + ensemblRelease = "release-" + ensemblVersion.split("_")[0]; + + String spShortName = getSpeciesShortname(speciesConfiguration); String spAssembly = assembly.getName().toLowerCase(); Path spFolder = output.resolve(spShortName + "_" + spAssembly); // /_/download @@ -121,9 +135,6 @@ public void execute() { logger.info("Building '{}' data", buildOption); CellBaseBuilder parser = null; switch (buildOption) { -// case EtlCommons.GENOME_INFO_DATA: -// buildGenomeInfo(); -// break; case EtlCommons.GENOME_DATA: parser = buildGenomeSequence(); break; @@ -250,11 +261,17 @@ private void copyVersionFiles(List pathList) { // } // } - private CellBaseBuilder buildGenomeSequence() { - copyVersionFiles(Collections.singletonList(downloadFolder.resolve("genome/genomeVersion.json"))); - Path fastaFile = getFastaReferenceGenome(); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "genome_sequence"); - return new GenomeSequenceFastaBuilder(fastaFile, serializer); + private CellBaseBuilder buildGenomeSequence() throws CellBaseException { + // Sanity check + Path genomeVersionPath = downloadFolder.resolve(GENOME_SUBDIRECTORY).resolve(GENOME_VERSION_FILENAME); + copyVersionFiles(Collections.singletonList(genomeVersionPath), buildFolder.resolve(GENOME_SUBDIRECTORY)); + + // Get FASTA path + Path fastaPath = getFastaReferenceGenome(); + + // Create serializer and return the genome builder + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(GENOME_SUBDIRECTORY), GENOME_DATA); + return new GenomeSequenceFastaBuilder(fastaPath, serializer); } private CellBaseBuilder buildGene() throws 
CellBaseException { @@ -381,19 +398,26 @@ private String getDefaultHumanAssembly() { + "configuration file. No hsapiens data found within the configuration.json file"); } - private Path getFastaReferenceGenome() { - Path fastaFile = null; - try { - DirectoryStream stream = Files.newDirectoryStream(downloadFolder.resolve("genome"), entry -> { - return entry.toString().endsWith(".fa"); - }); - for (Path entry : stream) { - fastaFile = entry; + private Path getFastaReferenceGenome() throws CellBaseException { + // Check FASTA and unzip if necessary + String ensemblUrl = getEnsemblUrl(configuration.getDownload().getEnsembl(), ensemblRelease, ENSEMBL_PRIMARY_FA_FILE_ID, + getSpeciesShortname(speciesConfiguration), assembly.getName(), null); + String fastaFilename = Paths.get(ensemblUrl).getFileName().toString(); + Path fastaPath = downloadFolder.resolve(GENOME_SUBDIRECTORY).resolve(fastaFilename); + if (fastaPath.toFile().exists()) { + // Gunzip + logger.info("Gunzip file: " + fastaPath); + try { + EtlCommons.runCommandLineProcess(null, "gunzip", Collections.singletonList(fastaPath.toString()), null); + } catch (IOException | InterruptedException e) { + throw new CellBaseException("Error executing gunzip in FASTA file " + fastaPath, e); } - } catch (IOException e) { - e.printStackTrace(); } - return fastaFile; + fastaPath = downloadFolder.resolve(GENOME_SUBDIRECTORY).resolve(fastaFilename.replace(".gz", "")); + if (!fastaPath.toFile().exists()) { + throw new CellBaseException("FASTA file " + fastaPath + " does not exist after executing gunzip"); + } + return fastaPath; } private CellBaseBuilder buildSplice() throws IOException { @@ -448,4 +472,46 @@ private CellBaseBuilder buildPharmacogenomics() throws IOException { CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(outFolder); return new PharmGKBBuilder(inFolder, serializer); } + + private void checkVersionFiles(List versionPaths) throws CellBaseException { + ObjectReader dataSourceReader = new 
ObjectMapper().readerFor(DataSource.class); + for (Path versionPath : versionPaths) { + if (!versionPath.toFile().exists()) { + throw new CellBaseException("Version file " + versionPath + " does not exist: this file is mandatory for version control"); + } + try { + DataSource dataSource = dataSourceReader.readValue(versionPath.toFile()); + if (org.apache.commons.lang3.StringUtils.isEmpty(dataSource.getVersion())) { + throw new CellBaseException("Version missing version in file " + versionPath + ": a version must be specified in the" + + " file"); + } + } catch (IOException e) { + throw new CellBaseException("Error parsing the version file " + versionPath, e); + } + } + } + + private void copyVersionFiles(List versionPaths, Path targetPath) throws CellBaseException { + // Check version files before copying them + checkVersionFiles(versionPaths); + if (!targetPath.toFile().exists()) { + try { + Files.createDirectories(targetPath); + } catch (IOException e) { + throw new CellBaseException("Error creating folder " + targetPath, e); + } + } + + for (Path versionPath : versionPaths) { + try { + Files.copy(versionPath, targetPath.resolve(versionPath.getFileName()), StandardCopyOption.REPLACE_EXISTING); + } catch (IOException e) { + throw new CellBaseException("Error copying version file " + versionPath + " to " + targetPath, e); + } + // Sanity check after copying + if (!targetPath.resolve(versionPath.getFileName()).toFile().exists()) { + throw new CellBaseException("Something wrong happened when copying version file " + versionPath + " to " + targetPath); + } + } + } } From ce6f8d5e4a2e20df5f4e43269a2d9cfdefc366bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 22 Apr 2024 13:08:22 +0200 Subject: [PATCH 032/148] app: fix sonnar issues in BuildCommandExecutor, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 120 ++++-------------- 1 file changed, 27 insertions(+), 93 deletions(-) diff --git 
a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 482a6b5693..1f99975a4b 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -34,7 +34,6 @@ import org.opencb.cellbase.lib.builders.*; import org.opencb.cellbase.lib.builders.clinical.variant.ClinicalVariantBuilder; -import java.io.File; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; @@ -60,11 +59,8 @@ public class BuildCommandExecutor extends CommandExecutor { private boolean normalize = true; private SpeciesConfiguration.Assembly assembly; - private String ensemblVersion; private String ensemblRelease; - private File ensemblScriptsFolder; - private boolean flexibleGTFParsing; private SpeciesConfiguration speciesConfiguration; @@ -75,15 +71,16 @@ public BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildComma this.output = Paths.get(buildCommandOptions.outputDirectory); normalize = !buildCommandOptions.skipNormalize; - this.ensemblScriptsFolder = new File(System.getProperty("basedir") + "/bin/ensembl-scripts/"); this.flexibleGTFParsing = buildCommandOptions.flexibleGTFParsing; } - /** * Parse specific 'build' command options. 
+ * + * @throws CellBaseException Exception */ - public void execute() { + public void execute() throws CellBaseException { + String buildOption = null; try { // Output directory need to be created if it doesn't exist if (!Files.exists(output)) { @@ -104,7 +101,7 @@ public void execute() { assembly = SpeciesUtils.getDefaultAssembly(speciesConfiguration); } - ensemblVersion = assembly.getEnsemblVersion(); + String ensemblVersion = assembly.getEnsemblVersion(); ensemblRelease = "release-" + ensemblVersion.split("_")[0]; String spShortName = getSpeciesShortname(speciesConfiguration); @@ -130,9 +127,8 @@ public void execute() { } for (int i = 0; i < buildOptions.length; i++) { - String buildOption = buildOptions[i]; + buildOption = buildOptions[i]; - logger.info("Building '{}' data", buildOption); CellBaseBuilder parser = null; switch (buildOption) { case EtlCommons.GENOME_DATA: @@ -156,9 +152,6 @@ public void execute() { case EtlCommons.PROTEIN_DATA: parser = buildProtein(); break; -// case EtlCommons.PPI_DATA: -// parser = getInteractionParser(); -// break; case EtlCommons.CONSERVATION_DATA: parser = buildConservation(); break; @@ -181,24 +174,26 @@ public void execute() { parser = buildPharmacogenomics(); break; default: - logger.error("Build option '" + buildCommandOptions.data + "' is not valid"); + logger.error("Build option '{}' is not valid", buildCommandOptions.data); break; } if (parser != null) { - try { - parser.parse(); - } catch (Exception e) { - logger.error("Error executing 'build' command " + buildCommandOptions.data + ": " + e.getMessage(), e); - } + logger.info("Building '{}' data ...", buildOption); + parser.parse(); + logger.info("Building '{}' data. 
Done.", buildOption); parser.disconnect(); } } } } catch (ParameterException e) { logger.error("Error parsing build command line parameters: " + e.getMessage(), e); - } catch (IOException | CellBaseException e) { - logger.error(e.getMessage()); + } catch (Exception e) { + String msg = "Error executing the command 'build'."; + if (StringUtils.isNotEmpty(buildOption)) { + msg += " It was building the data '" + buildOption + "'"; + } + throw new CellBaseException(msg, e); } } @@ -207,7 +202,6 @@ private CellBaseBuilder buildRepeats() { copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.TRF_VERSION_FILENAME))); copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.GSD_VERSION_FILENAME))); copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.WM_VERSION_FILENAME))); - // TODO: chunk size is not really used in ConvervedRegionParser, remove? CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, EtlCommons.REPEATS_JSON); return new RepeatsBuilder(repeatsFilesDir, serializer); } @@ -223,44 +217,11 @@ private void copyVersionFiles(List pathList) { try { Files.copy(path, downloadFolder.resolve(path.getFileName()), StandardCopyOption.REPLACE_EXISTING); } catch (IOException e) { - logger.warn("Version file {} not found - skipping", path.toString()); + logger.warn("Version file {} not found - skipping", path); } } } -// private void buildGenomeInfo() { -// /** -// * To get some extra info about the genome such as chromosome length or cytobands -// * we execute the following script. -// */ -// try { -// String outputFileName = downloadFolder.resolve("genome_info.json").toAbsolutePath().toString(); -// List args = new ArrayList<>(); -// args.addAll(Arrays.asList("--species", speciesConfigurathtion.getScientificName(), -// "--assembly", buildCommandOptions.assembly == null ? 
getDefaultHumanAssembly() : buildCommandOptions.assembly, -// "-o", outputFileName, -// "--ensembl-libs", configuration.getDownload().getEnsembl().getLibs())); -// if (!configuration.getSpecies().getVertebrates().contains(speciesConfiguration) -// && !speciesConfiguration.getScientificName().equals("Drosophila melanogaster")) { -// args.add("--phylo"); -// args.add("no-vertebrate"); -// } -// -// String geneInfoLogFileName = downloadFolder.resolve("genome_info.log").toAbsolutePath().toString(); -// -// boolean downloadedGenomeInfo; -// downloadedGenomeInfo = EtlCommons.runCommandLineProcess(ensemblScriptsFolder, "./genome_info.pl", args, geneInfoLogFileName); -// -// if (downloadedGenomeInfo) { -// logger.info(outputFileName + " created OK"); -// } else { -// logger.error("Genome info for " + speciesConfiguration.getScientificName() + " cannot be downloaded"); -// } -// } catch (IOException | InterruptedException e) { -// e.printStackTrace(); -// } -// } - private CellBaseBuilder buildGenomeSequence() throws CellBaseException { // Sanity check Path genomeVersionPath = downloadFolder.resolve(GENOME_SUBDIRECTORY).resolve(GENOME_VERSION_FILENAME); @@ -316,42 +277,12 @@ private CellBaseBuilder buildRegulation() { } private CellBaseBuilder buildProtein() { - Path proteinFolder = downloadFolder.resolve("protein"); + Path proteinFolder = downloadFolder.resolve(PROTEIN_SUBDIRECTORY); copyVersionFiles(Arrays.asList(proteinFolder.resolve("uniprotVersion.json"), proteinFolder.resolve("interproVersion.json"))); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "protein"); - return new ProteinBuilder(proteinFolder.resolve("uniprot_chunks"), - downloadFolder.resolve("protein").resolve("protein2ipr.dat.gz"), speciesConfiguration.getScientificName(), serializer); - } - - private void getProteinFunctionPredictionMatrices(SpeciesConfiguration sp, Path geneFolder) - throws IOException, InterruptedException { - logger.info("Downloading protein function 
prediction matrices ..."); - - // run protein_function_prediction_matrices.pl - String proteinFunctionProcessLogFile = geneFolder.resolve("protein_function_prediction_matrices.log").toString(); - List args = Arrays.asList("--species", sp.getScientificName(), "--outdir", geneFolder.toString(), - "--ensembl-libs", configuration.getDownload().getEnsembl().getLibs()); - - boolean proteinFunctionPredictionMatricesObtaines = EtlCommons.runCommandLineProcess(ensemblScriptsFolder, - "./protein_function_prediction_matrices.pl", - args, - proteinFunctionProcessLogFile); - - // check output - if (proteinFunctionPredictionMatricesObtaines) { - logger.info("Protein function prediction matrices created OK"); - } else { - logger.error("Protein function prediction matrices for " + sp.getScientificName() + " cannot be downloaded"); - } - } - - private CellBaseBuilder getInteractionParser() { - Path proteinFolder = downloadFolder.resolve("protein"); - Path psimiTabFile = proteinFolder.resolve("intact.txt"); - copyVersionFiles(Arrays.asList(proteinFolder.resolve("intactVersion.json"))); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "protein_protein_interaction"); - return new InteractionBuilder(psimiTabFile, speciesConfiguration.getScientificName(), serializer); + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, PROTEIN_DATA); + return new ProteinBuilder(proteinFolder.resolve("uniprot_chunks"), downloadFolder.resolve(PROTEIN_SUBDIRECTORY) + .resolve("protein2ipr.dat.gz"), speciesConfiguration.getScientificName(), serializer); } private CellBaseBuilder buildConservation() { @@ -359,7 +290,6 @@ private CellBaseBuilder buildConservation() { copyVersionFiles(Arrays.asList(conservationFilesDir.resolve("gerpVersion.json"), conservationFilesDir.resolve("phastConsVersion.json"), conservationFilesDir.resolve("phyloPVersion.json"))); - // TODO: chunk size is not really used in ConvervedRegionParser, remove? 
int conservationChunkSize = MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder); return new ConservationBuilder(conservationFilesDir, conservationChunkSize, serializer); @@ -406,10 +336,14 @@ private Path getFastaReferenceGenome() throws CellBaseException { Path fastaPath = downloadFolder.resolve(GENOME_SUBDIRECTORY).resolve(fastaFilename); if (fastaPath.toFile().exists()) { // Gunzip - logger.info("Gunzip file: " + fastaPath); + logger.info("Gunzip file: {}", fastaPath); try { EtlCommons.runCommandLineProcess(null, "gunzip", Collections.singletonList(fastaPath.toString()), null); - } catch (IOException | InterruptedException e) { + } catch (IOException e) { + throw new CellBaseException("Error executing gunzip in FASTA file " + fastaPath, e); + } catch (InterruptedException e) { + // Restore interrupted state... + Thread.currentThread().interrupt(); throw new CellBaseException("Error executing gunzip in FASTA file " + fastaPath, e); } } From 3566e011aaddff926f4a8320648559b386b822e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 22 Apr 2024 14:01:21 +0200 Subject: [PATCH 033/148] app: improve log/exception messages in DownloadCommandExecutor, #TASK-5564 --- .../app/cli/admin/executors/DownloadCommandExecutor.java | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java index f8d3e04eb9..8a763ae3c9 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java @@ -17,7 +17,6 @@ package org.opencb.cellbase.app.cli.admin.executors; import 
org.apache.commons.lang3.StringUtils; -import org.opencb.biodata.formats.io.FileFormatException; import org.opencb.cellbase.app.cli.CommandExecutor; import org.opencb.cellbase.app.cli.admin.AdminCliOptionsParser; import org.opencb.cellbase.core.exception.CellBaseException; @@ -25,7 +24,6 @@ import org.opencb.cellbase.lib.download.DownloadFile; import org.opencb.cellbase.lib.download.Downloader; -import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; @@ -109,12 +107,12 @@ public void execute() throws CellBaseException { } } AbstractDownloadManager.writeDownloadLogFile(outputDirectory, downloadFiles); - } catch (IOException | NoSuchMethodException | FileFormatException e) { - throw new CellBaseException("Error executing command line 'download'", e); } catch (InterruptedException e) { // Restore interrupted state... Thread.currentThread().interrupt(); - throw new CellBaseException("Error executing command line 'download'", e); + throw new CellBaseException("Error executing command line 'download': " + e.getMessage(), e); + } catch (Exception e) { + throw new CellBaseException("Error executing command line 'download': " + e.getMessage(), e); } } From cd94452ba578e658c6cbd2cce158fdaae8215bca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 22 Apr 2024 14:02:11 +0200 Subject: [PATCH 034/148] app: update repeats builder, and improve log/exception messages, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 184 ++++++++++-------- 1 file changed, 107 insertions(+), 77 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 1f99975a4b..6f9f531e47 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ 
b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -19,7 +19,7 @@ import com.beust.jcommander.ParameterException; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectReader; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.opencb.cellbase.app.cli.CommandExecutor; import org.opencb.cellbase.app.cli.admin.AdminCliOptionsParser; import org.opencb.cellbase.core.config.SpeciesConfiguration; @@ -64,6 +64,10 @@ public class BuildCommandExecutor extends CommandExecutor { private boolean flexibleGTFParsing; private SpeciesConfiguration speciesConfiguration; + private static final List VALID_SOURCES_TO_BUILD = Arrays.asList(GENOME_DATA, GENE_DATA, REFSEQ_DATA, + VARIATION_FUNCTIONAL_SCORE_DATA, MISSENSE_VARIATION_SCORE_DATA, REGULATION_DATA, PROTEIN_DATA, CONSERVATION_DATA, + CLINICAL_VARIANTS_DATA, REPEATS_DATA, ONTOLOGY_DATA, SPLICE_SCORE_DATA, PUBMED_DATA, PHARMACOGENOMICS_DATA); + public BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildCommandOptions) { super(buildCommandOptions.commonOptions.logLevel, buildCommandOptions.commonOptions.conf); @@ -82,6 +86,9 @@ public BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildComma public void execute() throws CellBaseException { String buildOption = null; try { + // Check data sources + List dataList = checkDataSources(); + // Output directory need to be created if it doesn't exist if (!Files.exists(output)) { Files.createDirectories(output); @@ -118,92 +125,84 @@ public void execute() throws CellBaseException { makeDir(buildFolder); } - if (buildCommandOptions.data != null) { - String[] buildOptions; - if (buildCommandOptions.data.equals("all")) { - buildOptions = speciesConfiguration.getData().toArray(new String[0]); - } else { - buildOptions = buildCommandOptions.data.split(","); + for (String data : dataList) { + CellBaseBuilder parser; + 
switch (data) { + case GENOME_DATA: + parser = buildGenomeSequence(); + break; + case GENE_DATA: + parser = buildGene(); + break; + case REFSEQ_DATA: + parser = buildRefSeq(); + break; + case VARIATION_FUNCTIONAL_SCORE_DATA: + parser = buildCadd(); + break; + case MISSENSE_VARIATION_SCORE_DATA: + parser = buildRevel(); + break; + case REGULATION_DATA: + parser = buildRegulation(); + break; + case PROTEIN_DATA: + parser = buildProtein(); + break; + case CONSERVATION_DATA: + parser = buildConservation(); + break; + case CLINICAL_VARIANTS_DATA: + parser = buildClinicalVariants(); + break; + case REPEATS_DATA: + parser = buildRepeats(); + break; + case ONTOLOGY_DATA: + parser = buildObo(); + break; + case SPLICE_SCORE_DATA: + parser = buildSplice(); + break; + case PUBMED_DATA: + parser = buildPubMed(); + break; + case PHARMACOGENOMICS_DATA: + parser = buildPharmacogenomics(); + break; + default: + throw new IllegalArgumentException("Value '" + buildOption + "' is not allowed for the data parameter." 
+ + " Valid values are: " + StringUtils.join(VALID_SOURCES_TO_BUILD, ",") + "; or use 'all' to build" + + " everything"); } - for (int i = 0; i < buildOptions.length; i++) { - buildOption = buildOptions[i]; - - CellBaseBuilder parser = null; - switch (buildOption) { - case EtlCommons.GENOME_DATA: - parser = buildGenomeSequence(); - break; - case EtlCommons.GENE_DATA: - parser = buildGene(); - break; - case EtlCommons.REFSEQ_DATA: - parser = buildRefSeq(); - break; - case EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA: - parser = buildCadd(); - break; - case EtlCommons.MISSENSE_VARIATION_SCORE_DATA: - parser = buildRevel(); - break; - case EtlCommons.REGULATION_DATA: - parser = buildRegulation(); - break; - case EtlCommons.PROTEIN_DATA: - parser = buildProtein(); - break; - case EtlCommons.CONSERVATION_DATA: - parser = buildConservation(); - break; - case EtlCommons.CLINICAL_VARIANTS_DATA: - parser = buildClinicalVariants(); - break; - case EtlCommons.REPEATS_DATA: - parser = buildRepeats(); - break; - case ONTOLOGY_DATA: - parser = buildObo(); - break; - case EtlCommons.SPLICE_SCORE_DATA: - parser = buildSplice(); - break; - case EtlCommons.PUBMED_DATA: - parser = buildPubMed(); - break; - case EtlCommons.PHARMACOGENOMICS_DATA: - parser = buildPharmacogenomics(); - break; - default: - logger.error("Build option '{}' is not valid", buildCommandOptions.data); - break; - } - - if (parser != null) { - logger.info("Building '{}' data ...", buildOption); - parser.parse(); - logger.info("Building '{}' data. Done.", buildOption); - parser.disconnect(); - } + if (parser != null) { + logger.info("Building '{}' data ...", buildOption); + parser.parse(); + logger.info("Building '{}' data. 
Done.", buildOption); + parser.disconnect(); } } - } catch (ParameterException e) { - logger.error("Error parsing build command line parameters: " + e.getMessage(), e); } catch (Exception e) { - String msg = "Error executing the command 'build'."; + String msg = "Error executing the command 'build'"; if (StringUtils.isNotEmpty(buildOption)) { - msg += " It was building the data '" + buildOption + "'"; + msg += ". The last data being built was '" + buildOption + "'"; } - throw new CellBaseException(msg, e); + throw new CellBaseException(msg + ": " + e.getMessage(), e); } } - private CellBaseBuilder buildRepeats() { - Path repeatsFilesDir = downloadFolder.resolve(EtlCommons.REPEATS_SUBDIRECTORY); - copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.TRF_VERSION_FILENAME))); - copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.GSD_VERSION_FILENAME))); - copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.WM_VERSION_FILENAME))); - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, EtlCommons.REPEATS_JSON); - return new RepeatsBuilder(repeatsFilesDir, serializer); + private CellBaseBuilder buildRepeats() throws CellBaseException { + // Sanity check + Path repeatsDownloadPath = downloadFolder.resolve(REPEATS_SUBDIRECTORY); + List versionPaths = Arrays.asList(repeatsDownloadPath.resolve(TRF_VERSION_FILENAME), + repeatsDownloadPath.resolve(GSD_VERSION_FILENAME), + repeatsDownloadPath.resolve(WM_VERSION_FILENAME)); + copyVersionFiles(versionPaths, buildFolder.resolve(REPEATS_SUBDIRECTORY)); + + // Create serializer and return the repeats builder + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(REPEATS_SUBDIRECTORY), REPEATS_DATA); + return new RepeatsBuilder(repeatsDownloadPath, serializer); } private CellBaseBuilder buildObo() { @@ -448,4 +447,35 @@ private void copyVersionFiles(List versionPaths, Path targetPath) throws C } } } + + private List 
checkDataSources() { + if (StringUtils.isEmpty(buildCommandOptions.data)) { + throw new IllegalArgumentException("Missing data parameter. Valid values are: " + + StringUtils.join(VALID_SOURCES_TO_BUILD, ",") + "; or use 'all' to download everything"); + } + List dataList = Arrays.asList(buildCommandOptions.data.split(",")); + for (String data : dataList) { + switch (data) { + case GENOME_DATA: + case GENE_DATA: + case REFSEQ_DATA: + case VARIATION_FUNCTIONAL_SCORE_DATA: + case MISSENSE_VARIATION_SCORE_DATA: + case REGULATION_DATA: + case PROTEIN_DATA: + case CONSERVATION_DATA: + case CLINICAL_VARIANTS_DATA: + case REPEATS_DATA: + case ONTOLOGY_DATA: + case SPLICE_SCORE_DATA: + case PUBMED_DATA: + case PHARMACOGENOMICS_DATA: + break; + default: + throw new IllegalArgumentException("Value '" + data + "' is not allowed for the data parameter. Valid values are: " + + StringUtils.join(VALID_SOURCES_TO_BUILD, ",") + "; or use 'all' to build everything"); + } + } + return dataList; + } } From 148814fdc5ac2a25f80cccd47e38e24d712e7631 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 22 Apr 2024 15:35:13 +0200 Subject: [PATCH 035/148] lib: update the repeats builder by removing the hardcoded filenames and taking them from the configuration file; update JUnit test and improve log messages, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 19 ++--- .../org/opencb/cellbase/lib/EtlCommons.java | 13 +--- .../lib/builders/CellBaseBuilder.java | 4 + .../cellbase/lib/builders/RepeatsBuilder.java | 77 ++++++++++++------- .../lib/builders/RepeatsBuilderTest.java | 4 +- .../test/resources/configuration.test.yaml | 17 +++- 6 files changed, 84 insertions(+), 50 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 6f9f531e47..5b03fd510e 100644 --- 
a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -84,7 +84,7 @@ public BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildComma * @throws CellBaseException Exception */ public void execute() throws CellBaseException { - String buildOption = null; + String data = null; try { // Check data sources List dataList = checkDataSources(); @@ -125,8 +125,9 @@ public void execute() throws CellBaseException { makeDir(buildFolder); } - for (String data : dataList) { - CellBaseBuilder parser; + CellBaseBuilder parser; + for (int i = 0; i < dataList.size(); i++) { + data = dataList.get(i); switch (data) { case GENOME_DATA: parser = buildGenomeSequence(); @@ -171,22 +172,22 @@ public void execute() throws CellBaseException { parser = buildPharmacogenomics(); break; default: - throw new IllegalArgumentException("Value '" + buildOption + "' is not allowed for the data parameter." + throw new IllegalArgumentException("Value '" + data + "' is not allowed for the data parameter." + " Valid values are: " + StringUtils.join(VALID_SOURCES_TO_BUILD, ",") + "; or use 'all' to build" + " everything"); } if (parser != null) { - logger.info("Building '{}' data ...", buildOption); + logger.info(CellBaseBuilder.BUILDING_LOG_MESSAGE, data); parser.parse(); - logger.info("Building '{}' data. Done.", buildOption); + logger.info(CellBaseBuilder.BUILDING_DONE_LOG_MESSAGE, data); parser.disconnect(); } } } catch (Exception e) { String msg = "Error executing the command 'build'"; - if (StringUtils.isNotEmpty(buildOption)) { - msg += ". The last data being built was '" + buildOption + "'"; + if (StringUtils.isNotEmpty(data)) { + msg += ". 
The last data being built was '" + data + "'"; } throw new CellBaseException(msg + ": " + e.getMessage(), e); } @@ -202,7 +203,7 @@ private CellBaseBuilder buildRepeats() throws CellBaseException { // Create serializer and return the repeats builder CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(REPEATS_SUBDIRECTORY), REPEATS_DATA); - return new RepeatsBuilder(repeatsDownloadPath, serializer); + return new RepeatsBuilder(repeatsDownloadPath, serializer, configuration); } private CellBaseBuilder buildObo() { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 3a98939a23..4370d0f203 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -65,8 +65,8 @@ public class EtlCommons { // Genome (Ensembl) public static final String GENOME_NAME = "Genome"; public static final String GENOME_DATA = "genome"; - public static final String GENOME_SUBDIRECTORY = "genome"; - public static final String GENOME_VERSION_FILENAME = "genome" + SUFFIX_VERSION_FILENAME; + public static final String GENOME_SUBDIRECTORY = GENOME_DATA; + public static final String GENOME_VERSION_FILENAME = GENOME_DATA + SUFFIX_VERSION_FILENAME; // Gene (Ensembl) public static final String GENE_DATA = "gene"; @@ -209,24 +209,19 @@ public class EtlCommons { // Repeats public static final String REPEATS_NAME = "Repeats"; public static final String REPEATS_DATA = "repeats"; - public static final String REPEATS_SUBDIRECTORY = "genome"; + public static final String REPEATS_SUBDIRECTORY = GENOME_SUBDIRECTORY; + @Deprecated public static final String REPEATS_JSON = "repeats"; // Simple repeats public static final String TRF_NAME = "Tandem Repeats Finder"; - @Deprecated - public static final String TRF_FILE = "simpleRepeat.txt.gz"; public static final String TRF_VERSION_FILENAME = 
"simpleRepeat" + SUFFIX_VERSION_FILENAME; public static final String SIMPLE_REPEATS_FILE_ID = "SIMPLE_REPEATS"; // Genomic super duplications public static final String GSD_NAME = "Genomic Super Duplications"; - @Deprecated - public static final String GSD_FILE = "genomicSuperDups.txt.gz"; public static final String GSD_VERSION_FILENAME = "genomicSuperDups" + SUFFIX_VERSION_FILENAME; public static final String GENOMIC_SUPER_DUPS_FILE_ID = "GENOMIC_SUPER_DUPS"; // Window masker public static final String WM_NAME = "Window Masker"; - @Deprecated - public static final String WM_FILE = "windowmaskerSdust.txt.gz"; public static final String WM_VERSION_FILENAME = "windowMasker" + SUFFIX_VERSION_FILENAME; public static final String WINDOW_MASKER_FILE_ID = "WINDOW_MASKER"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java index 79e5b7e58b..9dc95f8d83 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java @@ -29,6 +29,10 @@ public abstract class CellBaseBuilder { protected Logger logger; + public static final String BUILDING_LOG_MESSAGE = "Building {} ..."; + public static final String BUILDING_DONE_LOG_MESSAGE = "Building {} done!"; + + public CellBaseBuilder(CellBaseSerializer serializer) { logger = LoggerFactory.getLogger(this.getClass()); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java index d37765e0b6..6cefc0266f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java @@ -18,6 +18,8 @@ import org.opencb.biodata.models.core.Region; import 
org.opencb.biodata.models.variant.avro.Repeat; +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.lib.EtlCommons; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; import org.opencb.commons.ProgressLogger; @@ -27,55 +29,74 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; + +import static org.opencb.cellbase.lib.EtlCommons.*; /** * Created by fjlopez on 05/05/17. */ public class RepeatsBuilder extends CellBaseBuilder { - private static final String TRF = "trf"; - private static final String GSD = "genomicSuperDup"; - private static final String WM = "windowMasker"; + + private CellBaseConfiguration configuration; + private final Path filesDir; - public RepeatsBuilder(Path filesDir, CellBaseFileSerializer serializer) { + public RepeatsBuilder(Path filesDir, CellBaseFileSerializer serializer, CellBaseConfiguration configuration) { super(serializer); this.filesDir = filesDir; + this.configuration = configuration; } @Override public void parse() throws Exception { + logger.info(BUILDING_LOG_MESSAGE, EtlCommons.REPEATS_NAME); - logger.info("Parsing repeats..."); - if (Files.exists(filesDir.resolve(EtlCommons.TRF_FILE))) { - parseTrfFile(filesDir.resolve(EtlCommons.TRF_FILE)); - } else { - logger.warn("No TRF file found {}", EtlCommons.TRF_FILE); - logger.warn("Skipping TRF file parsing. 
TRF data models will not be built."); + // Check Simple Repeats (TRF) filename + String trfFilename = Paths.get(configuration.getDownload().getSimpleRepeats().getFiles().get(SIMPLE_REPEATS_FILE_ID)).getFileName() + .toString(); + if (!Files.exists(filesDir.resolve(trfFilename))) { + throw new CellBaseException(TRF_NAME + " file " + trfFilename + " does not exist at " + filesDir); } - if (Files.exists(filesDir.resolve(EtlCommons.GSD_FILE))) { - parseGsdFile(filesDir.resolve(EtlCommons.GSD_FILE)); - } else { - logger.warn("No Genomic Super Duplications file found {}", EtlCommons.GSD_FILE); - logger.warn("Skipping Genomic Super Duplications file parsing. " - + "Genomic Super Duplications data models will not be built."); + // Check Genomic Super Duplications (GSD) file + String gsdFilename = Paths.get(configuration.getDownload().getGenomicSuperDups().getFiles().get(GENOMIC_SUPER_DUPS_FILE_ID)) + .getFileName().toString(); + if (!Files.exists(filesDir.resolve(gsdFilename))) { + throw new CellBaseException(GSD_NAME + " file " + gsdFilename + " does not exist at " + filesDir); } - if (Files.exists(filesDir.resolve(EtlCommons.WM_FILE))) { - parseWmFile(filesDir.resolve(EtlCommons.WM_FILE)); - } else { - logger.warn("No WindowMasker file found {}", EtlCommons.WM_FILE); - logger.warn("Skipping WindowMasker file parsing. 
WindowMasker data models will not be built."); + // Check Window Masker (WM) file + String wmFilename = Paths.get(configuration.getDownload().getWindowMasker().getFiles().get(WINDOW_MASKER_FILE_ID)).getFileName() + .toString(); + if (!Files.exists(filesDir.resolve(wmFilename))) { + throw new CellBaseException(WM_NAME + " file " + wmFilename + " does not exist at " + filesDir); } - logger.info("Done."); + + // Parse TRF file + logger.info(BUILDING_LOG_MESSAGE, TRF_NAME); + parseTrfFile(filesDir.resolve(trfFilename)); + logger.info(BUILDING_DONE_LOG_MESSAGE, TRF_NAME); + + // Parse GSD file + logger.info(BUILDING_LOG_MESSAGE, GSD_NAME); + parseGsdFile(filesDir.resolve(gsdFilename)); + logger.info(BUILDING_DONE_LOG_MESSAGE, GSD_NAME); + + // Parse WM file + logger.info(BUILDING_LOG_MESSAGE, WM_NAME); + parseWmFile(filesDir.resolve(wmFilename)); + logger.info(BUILDING_DONE_LOG_MESSAGE, WM_NAME); + + logger.info(BUILDING_DONE_LOG_MESSAGE, EtlCommons.REPEATS_NAME); } private void parseTrfFile(Path filePath) throws IOException { try (BufferedReader bufferedReader = FileUtils.newBufferedReader(filePath)) { String line = bufferedReader.readLine(); - ProgressLogger progressLogger = new ProgressLogger("Parsed TRF lines:", + ProgressLogger progressLogger = new ProgressLogger("Parsed " + TRF_NAME + " lines:", () -> EtlCommons.countFileLines(filePath), 200).setBatchSize(10000); while (line != null) { serializer.serialize(parseTrfLine(line)); @@ -90,14 +111,14 @@ private Repeat parseTrfLine(String line) { return new Repeat(null, Region.normalizeChromosome(parts[1]), Integer.valueOf(parts[2]) + 1, Integer.valueOf(parts[3]), Integer.valueOf(parts[5]), Integer.valueOf(parts[7]), - Float.valueOf(parts[6]), Float.valueOf(parts[8]) / 100, Float.valueOf(parts[10]), parts[16], TRF); + Float.valueOf(parts[6]), Float.valueOf(parts[8]) / 100, Float.valueOf(parts[10]), parts[16], TRF_NAME); } private void parseGsdFile(Path filePath) throws IOException { try (BufferedReader bufferedReader = 
FileUtils.newBufferedReader(filePath)) { String line = bufferedReader.readLine(); - ProgressLogger progressLogger = new ProgressLogger("Parsed GSD lines:", + ProgressLogger progressLogger = new ProgressLogger("Parsed " + GSD_NAME + " lines:", () -> EtlCommons.countFileLines(filePath), 200).setBatchSize(10000); while (line != null) { serializer.serialize(parseGSDLine(line)); @@ -112,7 +133,7 @@ private Repeat parseGSDLine(String line) { return new Repeat(parts[11], Region.normalizeChromosome(parts[1]), Integer.valueOf(parts[2]) + 1, Integer.valueOf(parts[3]), null, null, 2f, Float.valueOf(parts[26]), null, - null, GSD); + null, GSD_NAME); } @@ -120,7 +141,7 @@ private void parseWmFile(Path filePath) throws IOException { try (BufferedReader bufferedReader = FileUtils.newBufferedReader(filePath)) { String line = bufferedReader.readLine(); - ProgressLogger progressLogger = new ProgressLogger("Parsed WM lines:", + ProgressLogger progressLogger = new ProgressLogger("Parsed " + WM_NAME + " lines:", () -> EtlCommons.countFileLines(filePath), 200).setBatchSize(10000); while (line != null) { serializer.serialize(parseWmLine(line)); @@ -134,6 +155,6 @@ private Repeat parseWmLine(String line) { String[] parts = line.split("\t"); return new Repeat(parts[4].replace("\t", ""), Region.normalizeChromosome(parts[1]), - Integer.valueOf(parts[2]) + 1, Integer.valueOf(parts[3]), null, null, null, null, null, null, WM); + Integer.valueOf(parts[2]) + 1, Integer.valueOf(parts[3]), null, null, null, null, null, null, WM_NAME); } } diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RepeatsBuilderTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RepeatsBuilderTest.java index 9c69a1e602..acce1fa92b 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RepeatsBuilderTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RepeatsBuilderTest.java @@ -21,6 +21,7 @@ import org.junit.jupiter.api.Test; import 
org.eclipse.jetty.util.ajax.JSON; import org.opencb.biodata.models.variant.avro.Repeat; +import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; import org.opencb.commons.utils.FileUtils; @@ -46,9 +47,10 @@ public RepeatsBuilderTest() { @Test public void testParse() throws Exception { + CellBaseConfiguration configuration = CellBaseConfiguration.load(getClass().getResourceAsStream("configuration.test.yaml")); Path repeatsFilesDir = Paths.get(getClass().getResource("/repeats").getPath()); CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), "repeats.test"); - (new RepeatsBuilder(repeatsFilesDir, serializer)).parse(); + (new RepeatsBuilder(repeatsFilesDir, serializer, configuration)).parse(); serializer.close(); assertEquals(loadRepeatSet(Paths.get(getClass().getResource("/repeats/repeats.test.json.gz").getFile())), loadRepeatSet(Paths.get("/tmp/repeats.test.json.gz"))); diff --git a/cellbase-lib/src/test/resources/configuration.test.yaml b/cellbase-lib/src/test/resources/configuration.test.yaml index 1322d2fa52..fd7a1498f8 100644 --- a/cellbase-lib/src/test/resources/configuration.test.yaml +++ b/cellbase-lib/src/test/resources/configuration.test.yaml @@ -85,12 +85,23 @@ download: host: http://docm.genome.wustl.edu dgv: host: http://dgv.tcag.ca/v106/docs + simpleRepeats: - host: http://hgdownload.cse.ucsc.edu/goldenPath + host: http://hgdownload.cse.ucsc.edu/ + files: + ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38 + SIMPLE_REPEATS: goldenPath/put_assembly_here/database/simpleRepeat.txt.gz windowMasker: - host: http://hgdownload.cse.ucsc.edu/goldenPath + host: http://hgdownload.cse.ucsc.edu/ + files: + ## The CellBase downloader will change put_assembly_here by the assembly, e.g. 
hg38 + WINDOW_MASKER: goldenPath/put_assembly_here/database/windowmaskerSdust.txt.gz genomicSuperDups: - host: http://hgdownload.cse.ucsc.edu/goldenPath + host: http://hgdownload.cse.ucsc.edu/ + files: + ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38 + GENOMIC_SUPER_DUPS: goldenPath/put_assembly_here/database/genomicSuperDups.txt.gz + gwasCatalog: host: ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/2016/09/28/gwas-catalog-associations.tsv hpo: From 30a4c87dfbbb6f84518757a75284dc9ac1d49aaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 22 Apr 2024 17:30:47 +0200 Subject: [PATCH 036/148] lib: update conservation builder by removing the hardcoded filenames and taking them from the version files (i.e., URLs of the DataSource), improve log/exception messages, and fix Sonar issues, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 18 +- .../org/opencb/cellbase/lib/EtlCommons.java | 14 +- .../lib/builders/CellBaseBuilder.java | 34 +- .../lib/builders/ConservationBuilder.java | 441 ++++++++---------- .../lib/builders/ConservationBuilderTest.java | 3 + 5 files changed, 245 insertions(+), 265 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 5b03fd510e..6e44db2a75 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -178,9 +178,7 @@ public void execute() throws CellBaseException { } if (parser != null) { - logger.info(CellBaseBuilder.BUILDING_LOG_MESSAGE, data); parser.parse(); - logger.info(CellBaseBuilder.BUILDING_DONE_LOG_MESSAGE, data); parser.disconnect(); } } @@ -285,14 +283,16 @@ private CellBaseBuilder buildProtein() {
.resolve("protein2ipr.dat.gz"), speciesConfiguration.getScientificName(), serializer); } - private CellBaseBuilder buildConservation() { - Path conservationFilesDir = downloadFolder.resolve("conservation"); - copyVersionFiles(Arrays.asList(conservationFilesDir.resolve("gerpVersion.json"), - conservationFilesDir.resolve("phastConsVersion.json"), - conservationFilesDir.resolve("phyloPVersion.json"))); + private CellBaseBuilder buildConservation() throws CellBaseException { + // Sanity check + Path conservationDownloadPath = downloadFolder.resolve(CONSERVATION_SUBDIRECTORY); + copyVersionFiles(Arrays.asList(conservationDownloadPath.resolve(GERP_VERSION_FILENAME), + conservationDownloadPath.resolve(PHASTCONS_VERSION_FILENAME), conservationDownloadPath.resolve(PHYLOP_VERSION_FILENAME)), + buildFolder.resolve(CONSERVATION_SUBDIRECTORY)); + int conservationChunkSize = MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder); - return new ConservationBuilder(conservationFilesDir, conservationChunkSize, serializer); + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(CONSERVATION_SUBDIRECTORY)); + return new ConservationBuilder(conservationDownloadPath, conservationChunkSize, serializer); } private CellBaseBuilder buildClinicalVariants() throws CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 4370d0f203..11a01b7a8b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -333,13 +333,15 @@ public class EtlCommons { public static final String GERP_FILE_ID = "GERP"; // PHASTCONS public static final String PHASTCONS_NAME = "PhastCons"; - public static final String PHASTCONS_SUBDIRECTORY = "phastCons"; - public static final String 
PHASTCONS_VERSION_FILENAME = "phastCons" + SUFFIX_VERSION_FILENAME; + public static final String PHASTCONS_DATA = "phastCons"; + public static final String PHASTCONS_SUBDIRECTORY = PHASTCONS_DATA; + public static final String PHASTCONS_VERSION_FILENAME = PHASTCONS_DATA + SUFFIX_VERSION_FILENAME; public static final String PHASTCONS_FILE_ID = "PHASTCONS"; // PHYLOP public static final String PHYLOP_NAME = "PhyloP"; - public static final String PHYLOP_SUBDIRECTORY = "phylop"; - public static final String PHYLOP_VERSION_FILENAME = "phylop" + SUFFIX_VERSION_FILENAME; + public static final String PHYLOP_DATA = "phylop"; + public static final String PHYLOP_SUBDIRECTORY = PHYLOP_DATA; + public static final String PHYLOP_VERSION_FILENAME = PHYLOP_DATA + SUFFIX_VERSION_FILENAME; public static final String PHYLOP_FILE_ID = "PHYLOP"; // Splice scores @@ -502,4 +504,8 @@ public static String getUrl(DownloadProperties.URLProperties props, String fileI } return url; } + + public static String getFilename(String prefix, String chromosome) { + return prefix + "_" + chromosome; + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java index 9dc95f8d83..3efe5d1388 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java @@ -16,10 +16,19 @@ package org.opencb.cellbase.lib.builders; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.File; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; + /** * Created by imedina on 30/08/14. 
*/ @@ -30,7 +39,10 @@ public abstract class CellBaseBuilder { protected Logger logger; public static final String BUILDING_LOG_MESSAGE = "Building {} ..."; - public static final String BUILDING_DONE_LOG_MESSAGE = "Building {} done!"; + public static final String BUILDING_DONE_LOG_MESSAGE = "Building {} done."; + + public static final String PARSING_LOG_MESSAGE = "Parsing file {} ..."; + public static final String PARSING_DONE_LOG_MESSAGE = "Parsing file {} done."; public CellBaseBuilder(CellBaseSerializer serializer) { @@ -50,4 +62,24 @@ public void disconnect() { } } + protected List checkFiles(DataSource dataSource, Path targetPath, String name) throws CellBaseException { + logger.info("Checking {} folder and files", name); + if (!targetPath.toFile().exists()) { + throw new CellBaseException(name + " folder does not exist " + targetPath); + } + + List files = new ArrayList<>(); + + List filenames = dataSource.getUrls().stream().map(u -> Paths.get(u).getFileName().toString()).collect(Collectors.toList()); + for (String filename : filenames) { + File file = targetPath.resolve(filename).toFile(); + if (!file.exists()) { + throw new CellBaseException("File " + file + " does not exits"); + } else { + files.add(file); + } + } + + return files; + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java index 9247b78faa..3aa9e2bb91 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java @@ -16,25 +16,26 @@ package org.opencb.cellbase.lib.builders; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectReader; import org.opencb.biodata.models.core.GenomicScoreRegion; import org.opencb.cellbase.core.exception.CellBaseException; +import 
org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; -import org.opencb.cellbase.lib.EtlCommons; import org.opencb.cellbase.lib.MongoDBCollectionConfiguration; import org.opencb.commons.utils.FileUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.BufferedReader; +import java.io.File; import java.io.IOException; -import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.Path; import java.util.*; +import static org.opencb.cellbase.lib.EtlCommons.*; + public class ConservationBuilder extends CellBaseBuilder { - private Logger logger; private Path conservedRegionPath; private int chunkSize; @@ -50,326 +51,259 @@ public ConservationBuilder(Path conservedRegionPath, int chunkSize, CellBaseFile fileSerializer = serializer; this.conservedRegionPath = conservedRegionPath; this.chunkSize = chunkSize; - logger = LoggerFactory.getLogger(ConservationBuilder.class); outputFileNames = new HashMap<>(); } @Override public void parse() throws IOException, CellBaseException { - System.out.println("conservedRegionPath = " + conservedRegionPath.toString()); if (conservedRegionPath == null || !Files.exists(conservedRegionPath) || !Files.isDirectory(conservedRegionPath)) { - throw new IOException("Conservation directory does not exist, is not a directory or cannot be read"); + throw new IOException("Conservation directory " + conservedRegionPath + " does not exist or it is not a directory or it cannot" + + " be read"); } - /* - * GERP is downloaded from Ensembl as a bigwig file. The library we have doesn't seem to parse - * this file correctly, so we transform the file into a bedGraph format which is human readable. 
- */ - Path gerpFolderPath = conservedRegionPath.resolve(EtlCommons.GERP_SUBDIRECTORY); - if (gerpFolderPath.toFile().exists()) { - logger.debug("Parsing GERP data ..."); - gerpParser(gerpFolderPath); - } else { - logger.debug("GERP data not found: " + gerpFolderPath.toString()); + ObjectReader dataSourceReader = new ObjectMapper().readerFor(DataSource.class); + + // Check GERP folder and files + Path gerpPath = conservedRegionPath.resolve(GERP_SUBDIRECTORY); + List gerpFiles = checkFiles(dataSourceReader.readValue(conservedRegionPath.resolve(GERP_VERSION_FILENAME).toFile()), gerpPath, + GERP_NAME); + + // Check PhastCons folder and files + Path phastConsPath = conservedRegionPath.resolve(PHASTCONS_SUBDIRECTORY); + List phastConsFiles = checkFiles(dataSourceReader.readValue(conservedRegionPath.resolve(PHASTCONS_VERSION_FILENAME).toFile()), + phastConsPath, PHASTCONS_NAME); + + // Check PhyloP folder and files + Path phylopPath = conservedRegionPath.resolve(PHYLOP_SUBDIRECTORY); + List phylopFiles = checkFiles(dataSourceReader.readValue(conservedRegionPath.resolve(PHYLOP_VERSION_FILENAME).toFile()), + phylopPath, PHYLOP_NAME); + + // GERP is downloaded from Ensembl as a bigwig file. The library we have doesn't seem to parse + // this file correctly, so we transform the file into a bedGraph format which is human-readable. + if (gerpFiles.size() != 1) { + throw new CellBaseException("Only one " + GERP_NAME + " file is expected, but currently there are " + gerpFiles.size() + + " files"); } + gerpParser(gerpFiles.get(0).toPath()); - /* - * UCSC phastCons and phylop are stored in the same format. They are processed together. - */ + // UCSC phastCons and phylop are stored in the same format. They are processed together. 
Map files = new HashMap<>(); String chromosome; Set chromosomes = new HashSet<>(); - // Reading all files in phastCons folder - DirectoryStream directoryStream = Files.newDirectoryStream(conservedRegionPath.resolve("phastCons"), "*.wigFix.gz"); - for (Path path : directoryStream) { - chromosome = path.getFileName().toString().split("\\.")[0].replace("chr", ""); + // Process PhastCons filenames + for (File file : phastConsFiles) { + chromosome = file.getName().split("\\.")[0].replace("chr", ""); chromosomes.add(chromosome); - files.put(chromosome + "phastCons", path); + files.put(chromosome + PHASTCONS_DATA, file.toPath()); } - // Reading all files in phylop folder - directoryStream = Files.newDirectoryStream(conservedRegionPath.resolve("phylop"), "*.wigFix.gz"); - for (Path path : directoryStream) { - chromosome = path.getFileName().toString().split("\\.")[0].replace("chr", ""); + // Process PhyloP filenames + for (File file : phylopFiles) { + chromosome = file.getName().split("\\.")[0].replace("chr", ""); chromosomes.add(chromosome); - files.put(chromosome + "phylop", path); + files.put(chromosome + PHYLOP_DATA, file.toPath()); } - /* - * Now we can iterate over all the chromosomes found and process the files - */ - logger.debug("Chromosomes found '{}'", chromosomes.toString()); + // Now we can iterate over all the chromosomes found and process the files + logger.debug("Chromosomes found '{}'", chromosomes); for (String chr : chromosomes) { - logger.debug("Processing chromosome '{}', file '{}'", chr, files.get(chr + "phastCons")); - processWigFixFile(files.get(chr + "phastCons"), "phastCons"); + logger.debug("Processing chromosome '{}', file '{}'", chr, files.get(chr + PHASTCONS_DATA)); + processWigFixFile(files.get(chr + PHASTCONS_DATA), PHASTCONS_NAME); - logger.debug("Processing chromosome '{}', file '{}'", chr, files.get(chr + "phylop")); - processWigFixFile(files.get(chr + "phylop"), "phylop"); + logger.debug("Processing chromosome '{}', file '{}'", chr, 
files.get(chr + PHYLOP_DATA)); + processWigFixFile(files.get(chr + PHYLOP_DATA), PHYLOP_NAME); } } - private void gerpParser(Path gerpFolderPath) throws IOException, CellBaseException { - Path gerpProcessFilePath = gerpFolderPath.resolve(EtlCommons.GERP_PROCESSED_FILE); - logger.info("parsing {}", gerpProcessFilePath); - BufferedReader bufferedReader = FileUtils.newBufferedReader(gerpProcessFilePath); - - String line; - int startOfBatch = 0; - int previousEndValue = 0; - String chromosome = null; - String previousChromosomeValue = null; - - List conservationScores = new ArrayList<>(chunkSize); - while ((line = bufferedReader.readLine()) != null) { - String[] fields = line.split("\t"); - - // file is wrong. throw an exception instead? - if (fields.length != 4) { - logger.error("skipping invalid line: " + line.length()); - continue; - } + private void gerpParser(Path gerpProcessFilePath) throws IOException, CellBaseException { + logger.info(PARSING_LOG_MESSAGE, gerpProcessFilePath); - chromosome = fields[0]; + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(gerpProcessFilePath)) { + String line; + int startOfBatch = 0; + int previousEndValue = 0; + String chromosome = null; + String previousChromosomeValue = null; - // new chromosome, store batch - if (previousChromosomeValue != null && !previousChromosomeValue.equals(chromosome)) { - storeScores(startOfBatch, previousChromosomeValue, conservationScores); + List conservationScores = new ArrayList<>(chunkSize); + while ((line = bufferedReader.readLine()) != null) { + String[] fields = line.split("\t"); - // reset values for current batch - startOfBatch = 0; - } + // Checking line + if (fields.length != 4) { + throw new CellBaseException("Invalid " + GERP_NAME + " line (expecting 4 columns): " + line); + } - // reset chromosome for next entry - previousChromosomeValue = chromosome; + chromosome = fields[0]; - // file is american! 
starts at zero, add one - int start = Integer.parseInt(fields[1]) + 1; - // inclusive - int end = Integer.parseInt(fields[2]) + 1; + // New chromosome, store batch + if (previousChromosomeValue != null && !previousChromosomeValue.equals(chromosome)) { + storeScores(startOfBatch, previousChromosomeValue, conservationScores); - // start coordinate for this batch of 2,000 - if (startOfBatch == 0) { - startOfBatch = start; - previousEndValue = 0; - } + // Reset values for current batch + startOfBatch = 0; + } - // if there is a gap between the last entry and this one. - if (previousEndValue != 0 && (start - previousEndValue) != 0) { - // gap is too big! store what we already have before processing more - if (start - previousEndValue >= chunkSize) { - // we have a full batch, store - storeScores(startOfBatch, chromosome, conservationScores); + // Reset chromosome for next entry + previousChromosomeValue = chromosome; - // reset batch to start at this record + // File is american! starts at zero, add one + int start = Integer.parseInt(fields[1]) + 1; + // Inclusive + int end = Integer.parseInt(fields[2]) + 1; + + // Start coordinate for this batch of 2,000 + if (startOfBatch == 0) { startOfBatch = start; - } else { - // fill in the gap with zeroes - // don't overfill the batch - while (previousEndValue < start && conservationScores.size() < chunkSize) { - conservationScores.add((float) 0); - previousEndValue++; + previousEndValue = 0; + } + + // If there is a gap between the last entry and this one + if (previousEndValue != 0 && (start - previousEndValue) != 0) { + // Gap is too big! 
store what we already have before processing more + if (start - previousEndValue >= chunkSize) { + // We have a full batch, store + storeScores(startOfBatch, chromosome, conservationScores); + + // Reset batch to start at this record + startOfBatch = start; + } else { + // Fill in the gap with zeroes, don't overfill the batch + while (previousEndValue < start && conservationScores.size() < chunkSize) { + conservationScores.add((float) 0); + previousEndValue++; + } + + // We have a full batch, store + if (conservationScores.size() == chunkSize) { + storeScores(startOfBatch, chromosome, conservationScores); + + // Reset: start a new batch + startOfBatch = start; + } } + } + + // Reset value + previousEndValue = end; + + // Score for these coordinates + String score = fields[3]; - // we have a full batch, store + // Add the score for each coordinate included in the range start-end + while (start < end) { + // We have a full batch: store if (conservationScores.size() == chunkSize) { storeScores(startOfBatch, chromosome, conservationScores); - // reset. start a new batch + // Reset: start a new batch startOfBatch = start; } - } - } - // reset value - previousEndValue = end; + // Add score to batch + conservationScores.add(Float.valueOf(score)); - // score for these coordinates - String score = fields[3]; + // Increment coordinate + start++; + } - // add the score for each coordinate included in the range start-end - while (start < end) { - // we have a full batch, store + // We have a full batch: store if (conservationScores.size() == chunkSize) { storeScores(startOfBatch, chromosome, conservationScores); - // reset. 
start a new batch - startOfBatch = start; + // Reset: start a new batch + startOfBatch = 0; } - - // add score to batch - conservationScores.add(Float.valueOf(score)); - - // increment coordinate - start++; } - - // we have a full batch, store - if (conservationScores.size() == chunkSize) { + // We need to serialize the last chunk that might be incomplete + if (!conservationScores.isEmpty()) { storeScores(startOfBatch, chromosome, conservationScores); - - // reset, start a new batch - startOfBatch = 0; } } - // we need to serialize the last chunk that might be incomplete - if (!conservationScores.isEmpty()) { - storeScores(startOfBatch, chromosome, conservationScores); - } - bufferedReader.close(); + + logger.info(PARSING_DONE_LOG_MESSAGE, gerpProcessFilePath); } private void storeScores(int startOfBatch, String chromosome, List conservationScores) throws CellBaseException { - // if this is a small batch, fill in the missing coordinates with 0 + // If this is a small batch, fill in the missing coordinates with 0 while (conservationScores.size() < chunkSize) { conservationScores.add((float) 0); } if (conservationScores.size() != chunkSize) { - throw new CellBaseException("invalid chunk size " + conservationScores.size() + " for " + chromosome + ":" + startOfBatch); + throw new CellBaseException("Invalid chunk size " + conservationScores.size() + " for " + chromosome + ":" + startOfBatch); } - GenomicScoreRegion conservationScoreRegion = new GenomicScoreRegion(chromosome, startOfBatch, - startOfBatch + conservationScores.size() - 1, "gerp", conservationScores); + GenomicScoreRegion conservationScoreRegion = new GenomicScoreRegion<>(chromosome, startOfBatch, + startOfBatch + conservationScores.size() - 1, GERP_NAME, conservationScores); fileSerializer.serialize(conservationScoreRegion, getOutputFileName(chromosome)); - // reset + // Reset conservationScores.clear(); } -// @Deprecated -// private void gerpParser(Path gerpFolderPath) throws IOException, 
InterruptedException { -// logger.info("Uncompressing {}", gerpFolderPath.resolve(EtlCommons.GERP_FILE)); -// List tarArgs = Arrays.asList("-xvzf", gerpFolderPath.resolve(EtlCommons.GERP_FILE).toString(), -// "--overwrite", "-C", gerpFolderPath.toString()); -// EtlCommons.runCommandLineProcess(null, "tar", tarArgs, null); -// -// DirectoryStream pathDirectoryStream = Files.newDirectoryStream(gerpFolderPath, "*.rates"); -// boolean filesFound = false; -// for (Path path : pathDirectoryStream) { -// filesFound = true; -// logger.info("Processing file '{}'", path.getFileName().toString()); -// String[] chromosome = path.getFileName().toString().replaceFirst("chr", "").split("\\."); -// BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(String.valueOf(path)))); -// String line; -// int start = 1; -// int end = 1999; -// int counter = 1; -// String[] fields; -// List val = new ArrayList<>(chunkSize); -// while ((line = bufferedReader.readLine()) != null) { -// fields = line.split("\t"); -// val.add(Float.valueOf(fields[1])); -// counter++; -// if (counter == chunkSize) { -//// ConservationScoreRegion conservationScoreRegion = new ConservationScoreRegion(chromosome[0], start, end, "gerp", -// val); -// GenomicScoreRegion conservationScoreRegion = -// new GenomicScoreRegion<>(chromosome[0], start, end, "gerp", val); -// fileSerializer.serialize(conservationScoreRegion, getOutputFileName(chromosome[0])); -// -// start = end + 1; -// end += chunkSize; -// -// counter = 0; -// val.clear(); -// } -// } -// -// // we need to serialize the last chunk that might be incomplete -//// ConservationScoreRegion conservationScoreRegion = -//// new ConservationScoreRegion(chromosome[0], start, start + val.size() - 1, "gerp", val); -// GenomicScoreRegion conservationScoreRegion = -// new GenomicScoreRegion<>(chromosome[0], start, start + val.size() - 1, "gerp", val); -// fileSerializer.serialize(conservationScoreRegion, 
getOutputFileName(chromosome[0])); -// -// bufferedReader.close(); -// } -// -// if (!filesFound) { -// logger.warn("No GERP++ files were found. Please check that the original file {} is there, that it was" -// + " properly decompressed and that the *.rates files are present", -// gerpFolderPath.resolve(EtlCommons.GERP_FILE)); -// } -// } - private void processWigFixFile(Path inGzPath, String conservationSource) throws IOException { - BufferedReader bufferedReader = FileUtils.newBufferedReader(inGzPath); - - String line; - String chromosome = ""; -// int start = 0, end = 0; - int start = 0; - float value; - Map attributes = new HashMap<>(); -// ConservedRegion conservedRegion = null; - List values = new ArrayList<>(); -// ConservationScoreRegion conservedRegion = null; - GenomicScoreRegion conservedRegion = null; - - while ((line = bufferedReader.readLine()) != null) { - if (line.startsWith("fixedStep")) { - //new group, save last - if (conservedRegion != null) { -// conservedRegion.setEnd(end); -// conservedRegion = new ConservationScoreRegion(chromosome, start, end, conservationSource, values); - conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, - conservationSource, values); - fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); - } + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(inGzPath)) { + + String line; + String chromosome = ""; + int start = 0; + float value; + Map attributes = new HashMap<>(); + List values = new ArrayList<>(); + GenomicScoreRegion conservedRegion = null; + + while ((line = bufferedReader.readLine()) != null) { + if (line.startsWith("fixedStep")) { + // New group, save last + if (conservedRegion != null) { + conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, + conservationSource, values); + fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); + } -// offset = 0; - attributes.clear(); - String[] 
attrFields = line.split(" "); - String[] attrKeyValue; - for (String attrField : attrFields) { - if (!attrField.equalsIgnoreCase("fixedStep")) { - attrKeyValue = attrField.split("="); - attributes.put(attrKeyValue[0].toLowerCase(), attrKeyValue[1]); + attributes.clear(); + String[] attrFields = line.split(" "); + String[] attrKeyValue; + for (String attrField : attrFields) { + if (!attrField.equalsIgnoreCase("fixedStep")) { + attrKeyValue = attrField.split("="); + attributes.put(attrKeyValue[0].toLowerCase(), attrKeyValue[1]); + } } - } - chromosome = formatChromosome(attributes); - start = Integer.parseInt(attributes.get("start")); -// end = Integer.parseInt(attributes.get("start")); - - values = new ArrayList<>(2000); - } else { - int startChunk = start / MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; -// end++; - int endChunk = (start + values.size()) / MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; - // This is the endChunk if current read score is - // appended to the array (otherwise it would be - // start + values.size() - 1). If this endChunk is - // different from the startChunk means that current - // conserved region must be dumped and current - // score must be associated to next chunk. Main - // difference to what there was before is that if - // the fixedStep starts on the last position of a - // chunk e.g. 
1999, the chunk must be created with - // just that score - the chunk was left empty with - // the old code - if (startChunk != endChunk) { -// conservedRegion = new ConservationScoreRegion(chromosome, start, end - 1, conservationSource, values); - conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, - conservationSource, values); - fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); - start = start + values.size(); - values.clear(); - } + chromosome = formatChromosome(attributes); + start = Integer.parseInt(attributes.get("start")); - value = Float.parseFloat(line.trim()); - values.add(value); + values = new ArrayList<>(2000); + } else { + int startChunk = start / MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; + int endChunk = (start + values.size()) / MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; + // This is the endChunk if current read score is appended to the array (otherwise it would be start + values.size() + // - 1). If this endChunk is different from the startChunk means that current conserved region must be dumped and + // current score must be associated to next chunk. Main difference to what there was before is that if the fixedStep + // starts on the last position of a chunk e.g. 
1999, the chunk must be created with just that score - the chunk was + // left empty with the old code + if (startChunk != endChunk) { + conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, conservationSource, + values); + fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); + start = start + values.size(); + values.clear(); + } + + value = Float.parseFloat(line.trim()); + values.add(value); + } } + + // Write last + conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, conservationSource, values); + fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); } - //write last -// conservedRegion = new ConservationScoreRegion(chromosome, start, end, conservationSource, values); - conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, conservationSource, - values); - fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); - bufferedReader.close(); } private String getOutputFileName(String chromosome) { @@ -379,13 +313,18 @@ private String getOutputFileName(String chromosome) { } String outputFileName = outputFileNames.get(chromosome); if (outputFileName == null) { - outputFileName = "conservation_" + chromosome; + outputFileName = getFilename(CONSERVATION_DATA, chromosome); outputFileNames.put(chromosome, outputFileName); } return outputFileName; } - // phylop and phastcons list the chromosome as M instead of the standard MT. replace. + /** + * Remove chr from the chromosome name; and phylop and phastcons list the chromosome as M instead of the standard MT, replace it. 
+ * + * @param attributes Attributes map with the chromosome name + * @return The new chromosome name + */ private String formatChromosome(Map attributes) { String chromosome = attributes.get("chrom").replace("chr", ""); diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/ConservationBuilderTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/ConservationBuilderTest.java index 5af6cbd7e9..6a21908c13 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/ConservationBuilderTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/ConservationBuilderTest.java @@ -23,6 +23,7 @@ import org.eclipse.jetty.util.ajax.JSON; import org.opencb.biodata.models.core.GenomicScoreRegion; import org.opencb.biodata.models.variant.avro.Repeat; +import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; import org.opencb.commons.utils.FileUtils; @@ -41,6 +42,8 @@ public class ConservationBuilderTest { @Test public void testParse() throws Exception { + CellBaseConfiguration configuration = CellBaseConfiguration.load(getClass().getResourceAsStream("configuration.test.yaml")); + Path conservationDir = Paths.get(ConservationBuilderTest.class.getResource("/conservation").toURI()); CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), "gerp.test"); (new ConservationBuilder(conservationDir, BATCH_SIZE, serializer)).parse(); From 85e17db92499b4325958330d53831a0177277fc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 23 Apr 2024 13:49:25 +0200 Subject: [PATCH 037/148] lib: call bigWigToBedGraph to convert the GERP bigwig to bed graph file, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 22 +++++++++++- .../lib/builders/ConservationBuilder.java | 34 +++++++++++++++++-- 2 files changed, 
53 insertions(+), 3 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 11a01b7a8b..e7f53fb687 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -28,6 +28,7 @@ import java.io.BufferedReader; import java.io.File; import java.io.IOException; +import java.io.InputStreamReader; import java.nio.file.Path; import java.util.ArrayList; import java.util.List; @@ -393,7 +394,7 @@ public static boolean runCommandLineProcess(File workingDirectory, String binPat ProcessBuilder builder = getProcessBuilder(workingDirectory, binPath, args, logFilePath); - logger.debug("Executing command: " + StringUtils.join(builder.command(), " ")); + logger.info("Executing command: " + StringUtils.join(builder.command(), " ")); Process process = builder.start(); process.waitFor(); @@ -508,4 +509,23 @@ public static String getUrl(DownloadProperties.URLProperties props, String fileI public static String getFilename(String prefix, String chromosome) { return prefix + "_" + chromosome; } + + public static boolean isExecutableAvailable(String executable) throws IOException, InterruptedException { + ProcessBuilder processBuilder = new ProcessBuilder("which", executable); + Process process = processBuilder.start(); + + try (BufferedReader reader = new BufferedReader(new InputStreamReader(process.getInputStream()))) { + String line; + StringBuilder output = new StringBuilder(); + while ((line = reader.readLine()) != null) { + output.append(line).append("\n"); + } + } + + int exitCode = process.waitFor(); + + // An exit code of 0 means that 'which' found the executable; + // any other exit code means that the executable is not available + return (exitCode == 0); + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java index 3aa9e2bb91..4014fdccdb 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java @@ -22,6 +22,7 @@ import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; +import org.opencb.cellbase.lib.EtlCommons; import org.opencb.cellbase.lib.MongoDBCollectionConfiguration; import org.opencb.commons.utils.FileUtils; @@ -30,6 +31,7 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.*; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -56,6 +58,8 @@ public ConservationBuilder(Path conservedRegionPath, int chunkSize, CellBaseFile @Override public void parse() throws IOException, CellBaseException { + logger.info(BUILDING_LOG_MESSAGE, CONSERVATION_NAME); + if (conservedRegionPath == null || !Files.exists(conservedRegionPath) || !Files.isDirectory(conservedRegionPath)) { throw new IOException("Conservation directory " + conservedRegionPath + " does not exist or it is not a directory or it cannot" + " be read"); @@ -84,7 +88,30 @@ public void parse() throws IOException, CellBaseException { throw new CellBaseException("Only one " + GERP_NAME + " file is expected, but currently there are " + gerpFiles.size() + " files"); } - gerpParser(gerpFiles.get(0).toPath()); + File bigwigFile = gerpFiles.get(0); + File bedgraphFile = Paths.get(gerpFiles.get(0).getAbsolutePath() + ".bedgraph").toFile(); + String exec = "bigWigToBedGraph"; + if (!bedgraphFile.exists()) { + try { + if (isExecutableAvailable(exec)) { + EtlCommons.runCommandLineProcess(null, exec, Arrays.asList(bigwigFile.toString(), bedgraphFile.toString()), null); + } else { + throw new CellBaseException(exec 
+ " not found in your system, install it to build " + GERP_NAME + ". It is available" + + " at http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/"); + } + } catch (IOException e) { + throw new CellBaseException("Error executing " + exec + " in BIGWIG file " + bigwigFile, e); + } catch (InterruptedException e) { + // Restore interrupted state... + Thread.currentThread().interrupt(); + throw new CellBaseException("" + e.getMessage(), e); + } + if (!bedgraphFile.exists()) { + throw new CellBaseException("Something happened when executing " + exec + " in BIGWIG file " + bigwigFile + "; the BED" + + " graph file was not generated. Please, check " + exec); + } + } + gerpParser(bedgraphFile.toPath()); // UCSC phastCons and phylop are stored in the same format. They are processed together. Map files = new HashMap<>(); @@ -114,6 +141,8 @@ public void parse() throws IOException, CellBaseException { logger.debug("Processing chromosome '{}', file '{}'", chr, files.get(chr + PHYLOP_DATA)); processWigFixFile(files.get(chr + PHYLOP_DATA), PHYLOP_NAME); } + + logger.info(BUILDING_DONE_LOG_MESSAGE, CONSERVATION_NAME); } private void gerpParser(Path gerpProcessFilePath) throws IOException, CellBaseException { @@ -132,7 +161,8 @@ private void gerpParser(Path gerpProcessFilePath) throws IOException, CellBaseEx // Checking line if (fields.length != 4) { - throw new CellBaseException("Invalid " + GERP_NAME + " line (expecting 4 columns): " + line); + throw new CellBaseException("Invalid " + GERP_NAME + " line (expecting 4 columns): " + fields.length + " items: " + + line); } chromosome = fields[0]; From 0223cb5ca43bfbde6f0ac71e2220cd75dd3f524e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 23 Apr 2024 17:57:40 +0200 Subject: [PATCH 038/148] lib: include log messages, #TASK-5564 --- .../org/opencb/cellbase/lib/builders/ConservationBuilder.java | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java index 4014fdccdb..ca34cfd2d7 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java @@ -276,6 +276,7 @@ private void storeScores(int startOfBatch, String chromosome, List conser } private void processWigFixFile(Path inGzPath, String conservationSource) throws IOException { + logger.info(PARSING_LOG_MESSAGE, inGzPath); try (BufferedReader bufferedReader = FileUtils.newBufferedReader(inGzPath)) { String line; @@ -334,6 +335,7 @@ private void processWigFixFile(Path inGzPath, String conservationSource) throws conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, conservationSource, values); fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); } + logger.info(PARSING_DONE_LOG_MESSAGE, inGzPath); } private String getOutputFileName(String chromosome) { From 833c3371fc29e24deb98e8dcb7e20309251f3a5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 23 Apr 2024 19:09:01 +0200 Subject: [PATCH 039/148] lib: improve ProteinBuilder by removing hardcoded file names, adding checks and log messages, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 21 ++++--- .../org/opencb/cellbase/lib/EtlCommons.java | 13 +++++ .../cellbase/lib/builders/ProteinBuilder.java | 57 +++++++++++-------- .../lib/download/AbstractDownloadManager.java | 14 +---- 4 files changed, 62 insertions(+), 43 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 6e44db2a75..04e5b928a3 100644 --- 
a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -210,6 +210,7 @@ private CellBaseBuilder buildObo() { return new OntologyBuilder(oboDir, serializer); } + @Deprecated private void copyVersionFiles(List pathList) { for (Path path : pathList) { try { @@ -274,13 +275,19 @@ private CellBaseBuilder buildRegulation() { return new RegulatoryFeatureBuilder(regulatoryRegionFilesDir, serializer); } - private CellBaseBuilder buildProtein() { - Path proteinFolder = downloadFolder.resolve(PROTEIN_SUBDIRECTORY); - copyVersionFiles(Arrays.asList(proteinFolder.resolve("uniprotVersion.json"), - proteinFolder.resolve("interproVersion.json"))); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, PROTEIN_DATA); - return new ProteinBuilder(proteinFolder.resolve("uniprot_chunks"), downloadFolder.resolve(PROTEIN_SUBDIRECTORY) - .resolve("protein2ipr.dat.gz"), speciesConfiguration.getScientificName(), serializer); + private CellBaseBuilder buildProtein() throws CellBaseException { + // Sanity check + Path proteinDownloadPath = downloadFolder.resolve(PROTEIN_SUBDIRECTORY); + Path proteinBuildPath = buildFolder.resolve(PROTEIN_SUBDIRECTORY); + copyVersionFiles(Arrays.asList(proteinDownloadPath.resolve(UNIPROT_VERSION_FILENAME), + proteinDownloadPath.resolve(INTERPRO_VERSION_FILENAME)), proteinBuildPath); + + // Create the file serializer and the protein builder + Path chunksPath = proteinDownloadPath.resolve(UNIPROT_CHUNKS_SUBDIRECTORY); + String uniprotFilename = getFilenameFromUrl(configuration.getDownload().getUniprot().getFiles().get(UNIPROT_FILE_ID)); + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(proteinBuildPath, PROTEIN_DATA); + return new ProteinBuilder(chunksPath, proteinDownloadPath.resolve(uniprotFilename), speciesConfiguration.getScientificName(), + serializer); } private 
CellBaseBuilder buildConservation() throws CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index e7f53fb687..f0fbcd1702 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -30,6 +30,7 @@ import java.io.IOException; import java.io.InputStreamReader; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; @@ -528,4 +529,16 @@ public static boolean isExecutableAvailable(String executable) throws IOExceptio // otherwise, it's not return (exitCode == 0); } + + public static String getFilenameFromProps(DownloadProperties.URLProperties props, String fileId) throws CellBaseException { + if (!props.getFiles().containsKey(fileId)) { + throw new CellBaseException("File ID " + fileId + " is missing in the DownloadProperties.URLProperties within the CellBase" + + " configuration file"); + } + return getFilenameFromUrl(props.getFiles().get(fileId)); + } + + public static String getFilenameFromUrl(String url) { + return Paths.get(url).getFileName().toString(); + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java index 0369a0e6aa..3dc6f04212 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java @@ -21,6 +21,7 @@ import com.fasterxml.jackson.databind.ObjectWriter; import org.opencb.biodata.formats.protein.uniprot.UniProtParser; import org.opencb.biodata.formats.protein.uniprot.v202003jaxb.*; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.opencb.commons.utils.FileUtils; import 
org.rocksdb.Options; @@ -42,14 +43,14 @@ import java.util.Map; import java.util.Set; +import static org.opencb.cellbase.lib.EtlCommons.*; + public class ProteinBuilder extends CellBaseBuilder { private Path uniprotFilesDir; private Path interproFilePath; private String species; - private Map proteinMap; - protected Logger logger = LoggerFactory.getLogger(this.getClass()); public ProteinBuilder(Path uniprotFilesDir, String species, CellBaseSerializer serializer) { @@ -65,23 +66,33 @@ public ProteinBuilder(Path uniprotFilesDir, Path interproFilePath, String specie } @Override - public void parse() throws IOException { + public void parse() throws CellBaseException { + logger.info(BUILDING_LOG_MESSAGE, PROTEIN_NAME); + // Check UniProt if (uniprotFilesDir == null || !Files.exists(uniprotFilesDir)) { - throw new IOException("File '" + uniprotFilesDir + "' not valid"); + throw new CellBaseException("Could not build " + UNIPROT_NAME + ": folder " + uniprotFilesDir + " does not exist"); + } + + // Check InterPro + if (interproFilePath != null && Files.exists(interproFilePath)) { + throw new CellBaseException("Could not build " + INTERPRO_NAME + ": file " + interproFilePath + " does not exist"); } + // Prepare RocksDB RocksDB rocksDb = getDBConnection(); ObjectMapper mapper = new ObjectMapper(); mapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); ObjectWriter jsonObjectWriter = mapper.writerFor(Entry.class); - proteinMap = new HashMap<>(30000); -// UniProtParser up = new UniProtParser(); + Map proteinMap = new HashMap<>(30000); + + // Parsing files try { File[] files = uniprotFilesDir.toFile().listFiles((dir, name) -> name.endsWith(".xml") || name.endsWith(".xml.gz")); for (File file : files) { + logger.info(PARSING_LOG_MESSAGE, file); Uniprot uniprot = (Uniprot) UniProtParser.loadXMLInfo(file.toString(), UniProtParser.UNIPROT_CONTEXT); for (Entry entry : uniprot.getEntry()) { @@ -89,16 +100,16 @@ public void parse() throws IOException { for 
(OrganismNameType organismNameType : entry.getOrganism().getName()) { entryOrganism = organismNameType.getValue(); if (entryOrganism.equals(species)) { -// proteinMap.put(entry.getAccession().get(0), entry); rocksDb.put(entry.getAccession().get(0).getBytes(), jsonObjectWriter.writeValueAsBytes(entry)); } } } + logger.info(PARSING_DONE_LOG_MESSAGE, file); } logger.debug("Number of proteins stored in map: '{}'", proteinMap.size()); - if (interproFilePath != null && Files.exists(interproFilePath)) { - BufferedReader interproBuffereReader = FileUtils.newBufferedReader(interproFilePath); + logger.info(PARSING_LOG_MESSAGE, interproFilePath); + try (BufferedReader interproBuffereReader = FileUtils.newBufferedReader(interproFilePath)) { Set hashSet = new HashSet<>(proteinMap.keySet()); Set visited = new HashSet<>(30000); @@ -114,7 +125,6 @@ public void parse() throws IOException { iprAdded = false; BigInteger start = BigInteger.valueOf(Integer.parseInt(fields[4])); BigInteger end = BigInteger.valueOf(Integer.parseInt(fields[5])); -// for (FeatureType featureType : proteinMap.get(fields[0]).getFeature()) { byte[] bytes = rocksDb.get(fields[0].getBytes()); Entry entry = mapper.readValue(bytes, Entry.class); for (FeatureType featureType : entry.getFeature()) { @@ -145,7 +155,6 @@ public void parse() throws IOException { locationType.setEnd(positionType2); featureType.setLocation(locationType); -// proteinMap.get(fields[0]).getFeature().add(featureType); bytes = rocksDb.get(fields[0].getBytes()); entry = mapper.readValue(bytes, Entry.class); entry.getFeature().add(featureType); @@ -158,11 +167,13 @@ public void parse() throws IOException { } if (++numInterProLinesProcessed % 10000000 == 0) { - logger.debug("{} InterPro lines processed. {} unique proteins processed", - numInterProLinesProcessed, numUniqueProteinsProcessed); + logger.debug("{} {} lines processed. 
{} unique proteins processed", numInterProLinesProcessed, INTERPRO_NAME, + numUniqueProteinsProcessed); } } - interproBuffereReader.close(); + logger.info(PARSING_DONE_LOG_MESSAGE, interproFilePath); + } catch (IOException e) { + throw new CellBaseException("Error parsing " + INTERPRO_NAME + " file: " + interproFilePath, e); } // Serialize and save results @@ -173,24 +184,22 @@ public void parse() throws IOException { } rocksDb.close(); - } catch (JAXBException | RocksDBException e) { - e.printStackTrace(); + } catch (JAXBException | RocksDBException | IOException e) { + throw new CellBaseException("Error parsing " + PROTEIN_NAME + " files", e); } + + logger.info(BUILDING_DONE_LOG_MESSAGE, PROTEIN_NAME); } - private RocksDB getDBConnection() { - // a static method that loads the RocksDB C++ library. + private RocksDB getDBConnection() throws CellBaseException { + // A static method that loads the RocksDB C++ library RocksDB.loadLibrary(); - // the Options class contains a set of configurable DB options - // that determines the behavior of a database. 
+ // The Options class contains a set of configurable DB options that determines the behavior of a database Options options = new Options().setCreateIfMissing(true); try { return RocksDB.open(options, uniprotFilesDir.resolve("integration.idx").toString()); } catch (RocksDBException e) { - // do some error handling - e.printStackTrace(); - System.exit(1); + throw new CellBaseException("Error preparing RocksDB", e); } - return null; } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index f3f01e7c30..7cf171e7dd 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -47,6 +47,8 @@ import java.time.LocalDateTime; import java.util.*; +import static org.opencb.cellbase.lib.EtlCommons.getFilenameFromUrl; + public abstract class AbstractDownloadManager { protected static final String DOWNLOADING_LOG_MESSAGE = "Downloading {} ..."; @@ -353,18 +355,6 @@ protected String getUrl(DownloadProperties.URLProperties props, String fileId) t return props.getHost() + filesValue; } } - - protected String getFilenameFromProps(DownloadProperties.URLProperties props, String fileId) throws CellBaseException { - if (!props.getFiles().containsKey(fileId)) { - throw new CellBaseException("File ID " + fileId + " is missing in the DownloadProperties.URLProperties within the CellBase" - + " configuration file"); - } - return getFilenameFromUrl(props.getFiles().get(fileId)); - } - - protected String getFilenameFromUrl(String url) { - return Paths.get(url).getFileName().toString(); - } } From 01deb0c1bd08354c6c081e2d2dbd8a6d826dc2e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 24 Apr 2024 09:43:30 +0200 Subject: [PATCH 040/148] lib: move DataSource reader from 
ConservationBuilder to the parent CellBaseBuilder to be used by other builders, #TASK-5564 --- .../org/opencb/cellbase/lib/builders/CellBaseBuilder.java | 3 +++ .../opencb/cellbase/lib/builders/ConservationBuilder.java | 5 ----- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java index 3efe5d1388..49d847c033 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java @@ -16,6 +16,8 @@ package org.opencb.cellbase.lib.builders; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectReader; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseSerializer; @@ -35,6 +37,7 @@ public abstract class CellBaseBuilder { protected CellBaseSerializer serializer; + protected ObjectReader dataSourceReader = new ObjectMapper().readerFor(DataSource.class); protected Logger logger; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java index ca34cfd2d7..79099a4d93 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java @@ -16,11 +16,8 @@ package org.opencb.cellbase.lib.builders; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.ObjectReader; import org.opencb.biodata.models.core.GenomicScoreRegion; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.core.models.DataSource; import 
org.opencb.cellbase.core.serializer.CellBaseFileSerializer; import org.opencb.cellbase.lib.EtlCommons; import org.opencb.cellbase.lib.MongoDBCollectionConfiguration; @@ -65,8 +62,6 @@ public void parse() throws IOException, CellBaseException { + " be read"); } - ObjectReader dataSourceReader = new ObjectMapper().readerFor(DataSource.class); - // Check GERP folder and files Path gerpPath = conservedRegionPath.resolve(GERP_SUBDIRECTORY); List gerpFiles = checkFiles(dataSourceReader.readValue(conservedRegionPath.resolve(GERP_VERSION_FILENAME).toFile()), gerpPath, From 9416894717b9cff285fa6736822f990edee9d2da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 24 Apr 2024 11:40:51 +0200 Subject: [PATCH 041/148] lib: move the function to split UniProt into chunks from the protein downloader to the protein builder, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 5 +- .../cellbase/lib/builders/ProteinBuilder.java | 119 +++++++++++++----- .../lib/download/ProteinDownloadManager.java | 60 ++------- 3 files changed, 99 insertions(+), 85 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 04e5b928a3..3d3b9d9d37 100644 ---
CellBaseSerializer serializer = new CellBaseJsonFileSerializer(proteinBuildPath, PROTEIN_DATA); - return new ProteinBuilder(chunksPath, proteinDownloadPath.resolve(uniprotFilename), speciesConfiguration.getScientificName(), - serializer); + return new ProteinBuilder(proteinDownloadPath, speciesConfiguration.getScientificName(), serializer); } private CellBaseBuilder buildConservation() throws CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java index 3dc6f04212..eb4c04a909 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java @@ -35,52 +35,67 @@ import java.io.BufferedReader; import java.io.File; import java.io.IOException; +import java.io.PrintWriter; import java.math.BigInteger; import java.nio.file.Files; import java.nio.file.Path; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; +import java.util.*; import static org.opencb.cellbase.lib.EtlCommons.*; public class ProteinBuilder extends CellBaseBuilder { - private Path uniprotFilesDir; - private Path interproFilePath; + private Path proteinPath; private String species; protected Logger logger = LoggerFactory.getLogger(this.getClass()); - public ProteinBuilder(Path uniprotFilesDir, String species, CellBaseSerializer serializer) { - this(uniprotFilesDir, null, species, serializer); - } - - public ProteinBuilder(Path uniprotFilesDir, Path interproFilePath, String species, CellBaseSerializer serializer) { + public ProteinBuilder(Path proteinPath, String species, CellBaseSerializer serializer) { super(serializer); - this.uniprotFilesDir = uniprotFilesDir; - this.interproFilePath = interproFilePath; + this.proteinPath = proteinPath; this.species = species; } @Override - public void parse() throws 
CellBaseException { + public void parse() throws CellBaseException, IOException { logger.info(BUILDING_LOG_MESSAGE, PROTEIN_NAME); - // Check UniProt - if (uniprotFilesDir == null || !Files.exists(uniprotFilesDir)) { - throw new CellBaseException("Could not build " + UNIPROT_NAME + ": folder " + uniprotFilesDir + " does not exist"); + // Sanity check + if (proteinPath == null) { + throw new CellBaseException(PROTEIN_NAME + " directory is missing (null)"); + } + if (!Files.exists(proteinPath)) { + throw new CellBaseException(PROTEIN_NAME + " directory " + proteinPath + " does not exist"); + } + if (!Files.isDirectory(proteinPath)) { + throw new CellBaseException(PROTEIN_NAME + " directory " + proteinPath + " is not a directory"); + } + + // Check UniProt file + List uniProtFiles = checkFiles(dataSourceReader.readValue(proteinPath.resolve(UNIPROT_VERSION_FILENAME).toFile()), + proteinPath, PROTEIN_NAME + "/" + UNIPROT_NAME); + if (uniProtFiles.size() != 1) { + throw new CellBaseException("Only one " + UNIPROT_NAME + " file is expected, but currently there are " + uniProtFiles.size() + + " files"); } - // Check InterPro - if (interproFilePath != null && Files.exists(interproFilePath)) { - throw new CellBaseException("Could not build " + INTERPRO_NAME + ": file " + interproFilePath + " does not exist"); + // Check InterPro file + List interProFiles = checkFiles(dataSourceReader.readValue(proteinPath.resolve(INTERPRO_VERSION_FILENAME).toFile()), + proteinPath, PROTEIN_NAME + "/" + INTERPRO_NAME); + if (interProFiles.size() != 1) { + throw new CellBaseException("Only one " + INTERPRO_NAME + " file is expected, but currently there are " + uniProtFiles.size() + + " files"); } + // Prepare UniProt data by splitting data in chunks + Path uniProtChunksPath = serializer.getOutdir().resolve(UNIPROT_CHUNKS_SUBDIRECTORY); + logger.info("Split {} file {} into chunks at {}", UNIPROT_NAME, uniProtFiles.get(0).getName(), uniProtChunksPath); + 
Files.createDirectories(uniProtChunksPath); + splitUniprot(proteinPath.resolve(uniProtFiles.get(0).getName()), uniProtChunksPath); + // Prepare RocksDB - RocksDB rocksDb = getDBConnection(); + RocksDB rocksDb = getDBConnection(uniProtChunksPath); ObjectMapper mapper = new ObjectMapper(); mapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); ObjectWriter jsonObjectWriter = mapper.writerFor(Entry.class); @@ -89,7 +104,7 @@ public void parse() throws CellBaseException { // Parsing files try { - File[] files = uniprotFilesDir.toFile().listFiles((dir, name) -> name.endsWith(".xml") || name.endsWith(".xml.gz")); + File[] files = uniProtChunksPath.toFile().listFiles((dir, name) -> name.endsWith(".xml") || name.endsWith(".xml.gz")); for (File file : files) { logger.info(PARSING_LOG_MESSAGE, file); @@ -108,8 +123,8 @@ public void parse() throws CellBaseException { } logger.debug("Number of proteins stored in map: '{}'", proteinMap.size()); - logger.info(PARSING_LOG_MESSAGE, interproFilePath); - try (BufferedReader interproBuffereReader = FileUtils.newBufferedReader(interproFilePath)) { + logger.info(PARSING_LOG_MESSAGE, interProFiles.get(0)); + try (BufferedReader interproBuffereReader = FileUtils.newBufferedReader(interProFiles.get(0).toPath())) { Set hashSet = new HashSet<>(proteinMap.keySet()); Set visited = new HashSet<>(30000); @@ -171,9 +186,9 @@ public void parse() throws CellBaseException { numUniqueProteinsProcessed); } } - logger.info(PARSING_DONE_LOG_MESSAGE, interproFilePath); + logger.info(PARSING_DONE_LOG_MESSAGE, interProFiles.get(0)); } catch (IOException e) { - throw new CellBaseException("Error parsing " + INTERPRO_NAME + " file: " + interproFilePath, e); + throw new CellBaseException("Error parsing " + INTERPRO_NAME + " file: " + interProFiles.get(0), e); } // Serialize and save results @@ -191,15 +206,63 @@ public void parse() throws CellBaseException { logger.info(BUILDING_DONE_LOG_MESSAGE, PROTEIN_NAME); } - private RocksDB 
getDBConnection() throws CellBaseException { + private RocksDB getDBConnection(Path uniProtChunksPath) throws CellBaseException { // A static method that loads the RocksDB C++ library RocksDB.loadLibrary(); // The Options class contains a set of configurable DB options that determines the behavior of a database Options options = new Options().setCreateIfMissing(true); try { - return RocksDB.open(options, uniprotFilesDir.resolve("integration.idx").toString()); + return RocksDB.open(options, uniProtChunksPath.resolve("integration.idx").toString()); } catch (RocksDBException e) { throw new CellBaseException("Error preparing RocksDB", e); } } + + private void splitUniprot(Path uniprotFilePath, Path splitOutdirPath) throws IOException { + PrintWriter pw = null; + try (BufferedReader br = FileUtils.newBufferedReader(uniprotFilePath)) { + StringBuilder header = new StringBuilder(); + boolean beforeEntry = true; + boolean inEntry = false; + int count = 0; + int chunk = 0; + String line; + while ((line = br.readLine()) != null) { + if (line.trim().startsWith("")) { + inEntry = false; + if (count % 10000 == 0) { + if (pw != null) { + pw.print(""); + pw.close(); + } + chunk++; + } + } + } + pw.print(""); + pw.close(); + } finally { + if (pw != null) { + pw.close(); + } + } + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java index 50255a3557..5cb8a4c1f0 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java @@ -18,14 +18,12 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.commons.utils.FileUtils; -import java.io.BufferedReader; import java.io.IOException; -import java.io.PrintWriter; import 
java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -46,12 +44,13 @@ public ProteinDownloadManager(String species, String assembly, Path targetDirect * @throws CellBaseException if there is an error in the CelllBase configuration file */ public List download() throws IOException, InterruptedException, CellBaseException { + logger.info(DOWNLOADING_LOG_MESSAGE, PROTEIN_NAME); if (!speciesHasInfoToDownload(speciesConfiguration, PROTEIN_DATA)) { - return null; + logger.info("{} not supported for the species {}", PROTEIN_NAME, speciesConfiguration.getScientificName()); + return Collections.emptyList(); } Path proteinFolder = downloadFolder.resolve(PROTEIN_SUBDIRECTORY); Files.createDirectories(proteinFolder); - logger.info("Downloading {} information at {} ...", PROTEIN_NAME, proteinFolder); DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); @@ -59,14 +58,9 @@ public List download() throws IOException, InterruptedException, C // Uniprot downloadFile = downloadAndSaveDataSource(configuration.getDownload().getUniprot(), UNIPROT_FILE_ID, UNIPROT_NAME, PROTEIN_DATA, UNIPROT_VERSION_FILENAME, proteinFolder); - Path chunksPath = proteinFolder.resolve(UNIPROT_CHUNKS_SUBDIRECTORY); - String uniprotFilename = getFilenameFromUrl(configuration.getDownload().getUniprot().getFiles().get(UNIPROT_FILE_ID)); - logger.info("Split UniProt file {} into chunks at {}", uniprotFilename, chunksPath); - Files.createDirectories(chunksPath); - splitUniprot(proteinFolder.resolve(uniprotFilename), chunksPath); downloadFiles.add(downloadFile); - // Interpro + // InterPro downloadFile = downloadAndSaveDataSource(configuration.getDownload().getInterpro(), INTERPRO_FILE_ID, INTERPRO_NAME, PROTEIN_DATA, INTERPRO_VERSION_FILENAME, proteinFolder); downloadFiles.add(downloadFile); @@ -76,48 +70,8 @@ public List download() throws IOException, 
InterruptedException, C INTACT_VERSION_FILENAME, proteinFolder); downloadFiles.add(downloadFile); - return downloadFiles; - } - - private void splitUniprot(Path uniprotFilePath, Path splitOutdirPath) throws IOException { - BufferedReader br = FileUtils.newBufferedReader(uniprotFilePath); - PrintWriter pw = null; - StringBuilder header = new StringBuilder(); - boolean beforeEntry = true; - boolean inEntry = false; - int count = 0; - int chunk = 0; - String line; - while ((line = br.readLine()) != null) { - if (line.trim().startsWith("")) { - inEntry = false; - if (count % 10000 == 0) { - pw.print(""); - pw.close(); - chunk++; - } - } - } - pw.print(""); - pw.close(); - br.close(); + return downloadFiles; } } From 909c0b2fbd1a1ab522489994044a49754ffe8ba7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 24 Apr 2024 12:59:30 +0200 Subject: [PATCH 042/148] core: fix regulation URLs in the configuration file, #TASK-5775, #TASK-5564 --- cellbase-core/src/main/resources/configuration.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 5022340bec..a2330cd00c 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -60,9 +60,9 @@ download: GTF: "release-put_release_here/gtf/put_species_here/put_capital_species_here.put_assembly_here.put_release_here.gtf.gz" PEP_FA: "release-put_release_here/fasta/put_species_here/pep/put_capital_species_here.put_assembly_here.pep.all.fa.gz" CDNA_FA: "release-put_release_here/fasta/put_species_here/cdna/put_capital_species_here.put_assembly_here.cdna.all.fa.gz" - REGULATORY_BUILD: "regulation/put_species_here/put_species_here.put_capital_assembly_here.Regulatory_Build.regulatory_features.20221007.gff.gz" - MOTIF_FEATURES: 
"regulation/put_species_here/MotifFeatures/put_capital_species_here.put_assembly_here.motif_features.gff.gz" - MOTIF_FEATURES_INDEX: "regulation/put_species_here/MotifFeatures/put_capital_species_here.put_assembly_here.motif_features.gff.gz.tbi" + REGULATORY_BUILD: "release-put_release_here/regulation/put_species_here/put_species_here.put_assembly_here.Regulatory_Build.regulatory_features.20221007.gff.gz" + MOTIF_FEATURES: "release-put_release_here/regulation/put_species_here/MotifFeatures/put_species_here.put_assembly_here.motif_features.gff.gz" + MOTIF_FEATURES_INDEX: "release-put_release_here/regulation/put_species_here/MotifFeatures/put_species_here.put_assembly_here.motif_features.gff.gz.tbi" ensemblGenomes: database: host: mysql-eg-publicsql.ebi.ac.uk:4157 From 71d8056e0ef6b61a0534a10b6cdd25f620d4b054 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 24 Apr 2024 13:01:54 +0200 Subject: [PATCH 043/148] lib: launch a CellBase exception if executing a command (wget, gunzip,...) 
fails; and improve log messages, #TASK-5775, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 26 +++++++++++++------ .../lib/download/AbstractDownloadManager.java | 4 +-- .../lib/download/GenomeDownloadManager.java | 5 ++-- .../lib/download/PharmGKBDownloadManager.java | 2 +- .../lib/download/PubMedDownloadManager.java | 2 +- 5 files changed, 25 insertions(+), 14 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index f0fbcd1702..3faea3b305 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -382,7 +382,7 @@ public class EtlCommons { public static final String PUBMED_REGEX_FILE_ID = "PUBMED"; public static boolean runCommandLineProcess(File workingDirectory, String binPath, List args, String logFilePath) - throws IOException, InterruptedException { + throws IOException, InterruptedException, CellBaseException { // This small hack allow to configure the appropriate Logger level from the command line, this is done // by setting the DEFAULT_LOG_LEVEL_KEY before the logger object is created. // org.apache.log4j.Logger rootLogger = LogManager.getRootLogger(); @@ -395,18 +395,28 @@ public static boolean runCommandLineProcess(File workingDirectory, String binPat ProcessBuilder builder = getProcessBuilder(workingDirectory, binPath, args, logFilePath); - logger.info("Executing command: " + StringUtils.join(builder.command(), " ")); + logger.debug("Executing command: " + StringUtils.join(builder.command(), " ")); Process process = builder.start(); process.waitFor(); // Check process output - boolean executedWithoutErrors = true; - int genomeInfoExitValue = process.exitValue(); - if (genomeInfoExitValue != 0) { - logger.warn("Error executing {}, error code: {}. 
More info in log file: {}", binPath, genomeInfoExitValue, logFilePath); - executedWithoutErrors = false; + if (process.exitValue() != 0) { + String msg = "Error executing command '" + binPath + "'; error code = " + process.exitValue() + ". More info in log file: " + + logFilePath; + logger.error(msg); + throw new CellBaseException(msg); } - return executedWithoutErrors; + + return true; +// +// +// boolean executedWithoutErrors = true; +// int genomeInfoExitValue = process.exitValue(); +// if (genomeInfoExitValue != 0) { +// logger.warn("Error executing {}, error code: {}. More info in log file: {}", binPath, genomeInfoExitValue, logFilePath); +// executedWithoutErrors = false; +// } +// return executedWithoutErrors; } private static ProcessBuilder getProcessBuilder(File workingDirectory, String binPath, List args, String logFilePath) { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index 7cf171e7dd..193f2e146d 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -259,12 +259,12 @@ protected String getPhylo(SpeciesConfiguration sp) { } } - protected DownloadFile downloadFile(String url, String outputFileName) throws IOException, InterruptedException { + protected DownloadFile downloadFile(String url, String outputFileName) throws IOException, InterruptedException, CellBaseException { return downloadFile(url, outputFileName, null); } protected DownloadFile downloadFile(String url, String outputFileName, List wgetAdditionalArgs) - throws IOException, InterruptedException { + throws IOException, InterruptedException, CellBaseException { DownloadFile downloadFileInfo = new DownloadFile(url, outputFileName, Timestamp.valueOf(LocalDateTime.now()).toString()); Long startTime = 
System.currentTimeMillis(); if (Paths.get(outputFileName).toFile().exists()) { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index f36f493e1f..210d5bc39f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -67,8 +67,9 @@ public List downloadReferenceGenome() throws IOException, Interrup * @return list of files downloaded * @throws IOException if there is an error writing to a file * @throws InterruptedException if there is an error downloading files + * @throws CellBaseException if there is an error executing the command line */ - public List downloadConservation() throws IOException, InterruptedException { + public List downloadConservation() throws IOException, InterruptedException, CellBaseException { if (!speciesHasInfoToDownload(speciesConfiguration, CONSERVATION_DATA)) { return Collections.emptyList(); } @@ -138,7 +139,7 @@ public List downloadConservation() throws IOException, Interrupted return downloadFiles; } - public List downloadRepeats() throws IOException, InterruptedException { + public List downloadRepeats() throws IOException, InterruptedException, CellBaseException { if (!speciesHasInfoToDownload(speciesConfiguration, REPEATS_DATA)) { return Collections.emptyList(); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java index 04e72d3247..f52c3f8a23 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java @@ -42,7 +42,7 @@ public PharmGKBDownloadManager(String species, String assembly, Path 
targetDirec } @Override - public List download() throws IOException, InterruptedException { + public List download() throws IOException, InterruptedException, CellBaseException { DownloadProperties.URLProperties pharmGKB = configuration.getDownload().getPharmGKB(); Path pharmgkbDownloadFolder = downloadFolder.resolve(PHARMACOGENOMICS_SUBDIRECTORY).resolve(PHARMGKB_SUBDIRECTORY); Files.createDirectories(pharmgkbDownloadFolder); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java index e5a8c78f26..87e4ec8b98 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java @@ -35,7 +35,7 @@ public PubMedDownloadManager(String species, String assembly, Path targetDirecto } @Override - public List download() throws IOException, InterruptedException { + public List download() throws IOException, InterruptedException, CellBaseException { Path pubmedFolder = downloadFolder.resolve(EtlCommons.PUBMED_SUBDIRECTORY); Files.createDirectories(pubmedFolder); logger.info("Downloading {} files at {} ...", EtlCommons.PUBMED_DATA, pubmedFolder); From 15448249e65f9017d9746e2d8acdf575952f2164 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 24 Apr 2024 13:25:15 +0200 Subject: [PATCH 044/148] lib: fix sonnar issues, #TASK-5775, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 88 ++++++++----------- 1 file changed, 35 insertions(+), 53 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 3faea3b305..fec7904b80 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -37,7 +37,7 @@ /** * Created by fjlopez on 03/06/16. */ -public class EtlCommons { +public final class EtlCommons { // Ensembl public static final String ENSEMBL_NAME = "Ensembl"; @@ -159,8 +159,8 @@ public class EtlCommons { // PharmGKB public static final String PHARMGKB_NAME = "PharmGKB"; public static final String PHARMGKB_DATA = "pharmgkb"; - public static final String PHARMGKB_SUBDIRECTORY = "pharmgkb"; - public static final String PHARMGKB_VERSION_FILENAME = "pharmgkb" + SUFFIX_VERSION_FILENAME; + public static final String PHARMGKB_SUBDIRECTORY = PHARMGKB_DATA; + public static final String PHARMGKB_VERSION_FILENAME = PHARMGKB_DATA + SUFFIX_VERSION_FILENAME; // Must match the configuration file public static final String PHARMGKB_GENES_FILE_ID = "GENES"; public static final String PHARMGKB_CHEMICALS_FILE_ID = "CHEMICALS"; @@ -212,6 +212,9 @@ public class EtlCommons { public static final String REPEATS_NAME = "Repeats"; public static final String REPEATS_DATA = "repeats"; public static final String REPEATS_SUBDIRECTORY = GENOME_SUBDIRECTORY; + /** + * @deprecated (when refactoring downloaders, builders and loaders) + */ @Deprecated public static final String REPEATS_JSON = "repeats"; // Simple repeats @@ -290,15 +293,6 @@ public class EtlCommons { public static final String CADD_DATA = "cadd"; public static final String PPI_DATA = "ppi"; public static final String DRUG_DATA = "drug"; -// public static final String CLINVAR_DATA = "clinvar"; -// public static final String DOCM_DATA = "docm"; -// public static final String COSMIC_DATA = "cosmic"; -// public static final String GWAS_DATA = "gwas"; -// public static final String IARCTP53_GERMLINE_FILE = "germlineMutationDataIARC TP53 Database, R20.txt"; -// public static final String IARCTP53_GERMLINE_REFERENCES_FILE = "germlineMutationReferenceIARC TP53 Database, R20.txt"; -// public static final String IARCTP53_SOMATIC_FILE = 
"somaticMutationDataIARC TP53 Database, R20.txt"; -// public static final String IARCTP53_SOMATIC_REFERENCES_FILE = "somaticMutationReferenceIARC TP53 Database, R20.txt"; -// public static final String HGMD_DATA = "hgmd"; // Load specific data options public static final String PROTEIN_FUNCTIONAL_PREDICTION_DATA = "protein_functional_prediction"; @@ -348,23 +342,18 @@ public class EtlCommons { // Splice scores public static final String MMSPLICE_SUBDIRECTORY = "mmsplice"; - public static final String MMSPLICE_VERSION_FILENAME = "mmsplice" + SUFFIX_VERSION_FILENAME; + public static final String MMSPLICE_VERSION_FILENAME = MMSPLICE_SUBDIRECTORY + SUFFIX_VERSION_FILENAME; public static final String SPLICEAI_SUBDIRECTORY = "spliceai"; - public static final String SPLICEAI_VERSION_FILENAME = "spliceai" + SUFFIX_VERSION_FILENAME; + public static final String SPLICEAI_VERSION_FILENAME = SPLICEAI_SUBDIRECTORY + SUFFIX_VERSION_FILENAME; - // binary bigwig file + /** + * @deprecated (when refactoring downloaders, builders and loaders) + */ @Deprecated public static final String GERP_FILE = "gerp_conservation_scores.homo_sapiens.GRCh38.bw"; - // bigwig file manually transformed to bedGraph file - public static final String GERP_PROCESSED_FILE = "gerp.bedGraph.gz"; //"gerp_conservation_scores.homo_sapiens.GRCh38.bedGraph.gz"; public static final String CLINICAL_VARIANTS_JSON_FILE = "clinical_variants.json.gz"; public static final String CLINICAL_VARIANTS_ANNOTATED_JSON_FILE = "clinical_variants.full.json.gz"; - public static final String DOCM_FILE = "docm.json.gz"; public static final String DOCM_NAME = "DOCM"; - public static final String STRUCTURAL_VARIANTS_FOLDER = "structuralVariants"; - public static final String DGV_FILE = "dgv.txt"; - public static final String DGV_VERSION_FILE = "dgvVersion.json"; - public static final String STRUCTURAL_VARIANTS_JSON = "structuralVariants"; public static final String OBO_JSON = "ontology"; public static final String HPO_VERSION_FILE = 
"hpo" + SUFFIX_VERSION_FILENAME; @@ -377,17 +366,16 @@ public class EtlCommons { // PubMed public static final String PUBMED_NAME = "PubMed"; public static final String PUBMED_DATA = "pubmed"; - public static final String PUBMED_SUBDIRECTORY = "pubmed"; - public static final String PUBMED_VERSION_FILENAME = "pubmed" + SUFFIX_VERSION_FILENAME; + public static final String PUBMED_SUBDIRECTORY = PUBMED_DATA; + public static final String PUBMED_VERSION_FILENAME = PUBMED_DATA + SUFFIX_VERSION_FILENAME; public static final String PUBMED_REGEX_FILE_ID = "PUBMED"; + private EtlCommons() { + throw new IllegalStateException("Utility class"); + } + public static boolean runCommandLineProcess(File workingDirectory, String binPath, List args, String logFilePath) throws IOException, InterruptedException, CellBaseException { - // This small hack allow to configure the appropriate Logger level from the command line, this is done - // by setting the DEFAULT_LOG_LEVEL_KEY before the logger object is created. -// org.apache.log4j.Logger rootLogger = LogManager.getRootLogger(); -// ConsoleAppender stderr = (ConsoleAppender) rootLogger.getAppender("stdout"); -// stderr.setThreshold(Level.toLevel("debug")); Configurator.setRootLevel(Level.INFO); @@ -395,7 +383,9 @@ public static boolean runCommandLineProcess(File workingDirectory, String binPat ProcessBuilder builder = getProcessBuilder(workingDirectory, binPath, args, logFilePath); - logger.debug("Executing command: " + StringUtils.join(builder.command(), " ")); + if (logger.isDebugEnabled()) { + logger.debug("Executing command: {}", StringUtils.join(builder.command(), " ")); + } Process process = builder.start(); process.waitFor(); @@ -408,15 +398,6 @@ public static boolean runCommandLineProcess(File workingDirectory, String binPat } return true; -// -// -// boolean executedWithoutErrors = true; -// int genomeInfoExitValue = process.exitValue(); -// if (genomeInfoExitValue != 0) { -// logger.warn("Error executing {}, error code: {}. 
More info in log file: {}", binPath, genomeInfoExitValue, logFilePath); -// executedWithoutErrors = false; -// } -// return executedWithoutErrors; } private static ProcessBuilder getProcessBuilder(File workingDirectory, String binPath, List args, String logFilePath) { @@ -466,24 +447,23 @@ public static Long countFileLines(Path filePath) throws IOException { public static String getEnsemblUrl(DownloadProperties.EnsemblProperties props, String ensemblRelease, String fileId, String species, String assembly, String chromosome) throws CellBaseException { if (!props.getUrl().getFiles().containsKey(fileId)) { - throw new CellBaseException("File ID " + fileId + " is missing in the DownloadProperties.EnsemblProperties within the CellBase" - + " configuration file"); + throw new CellBaseException(getMissingFileIdMessage(fileId)); } String url = props.getUrl().getHost() + props.getUrl().getFiles().get(fileId); // Change release, species, assembly, chromosome if necessary if (StringUtils.isNotEmpty(ensemblRelease)) { - url = url.replaceAll(PUT_RELEASE_HERE_MARK, ensemblRelease.split("-")[1]); + url = url.replace(PUT_RELEASE_HERE_MARK, ensemblRelease.split("-")[1]); } if (StringUtils.isNotEmpty(species)) { - url = url.replaceAll(PUT_SPECIES_HERE_MARK, species); - url = url.replaceAll(PUT_CAPITAL_SPECIES_HERE_MARK, Character.toUpperCase(species.charAt(0)) + species.substring(1)); + url = url.replace(PUT_SPECIES_HERE_MARK, species); + url = url.replace(PUT_CAPITAL_SPECIES_HERE_MARK, Character.toUpperCase(species.charAt(0)) + species.substring(1)); } if (StringUtils.isNotEmpty(assembly)) { - url = url.replaceAll(PUT_ASSEMBLY_HERE_MARK, assembly); + url = url.replace(PUT_ASSEMBLY_HERE_MARK, assembly); } if (StringUtils.isNotEmpty(chromosome)) { - url = url.replaceAll(PUT_CHROMOSOME_HERE_MARK, chromosome); + url = url.replace(PUT_CHROMOSOME_HERE_MARK, chromosome); } return url; } @@ -495,8 +475,7 @@ public static String getUrl(DownloadProperties.URLProperties props, String fileI 
public static String getUrl(DownloadProperties.URLProperties props, String fileId, String species, String assembly, String chromosome) throws CellBaseException { if (!props.getFiles().containsKey(fileId)) { - throw new CellBaseException("File ID " + fileId + " is missing in the DownloadProperties.URLProperties within the CellBase" - + " configuration file"); + throw new CellBaseException(getMissingFileIdMessage(fileId)); } String url; String filesValue = props.getFiles().get(fileId); @@ -506,13 +485,13 @@ public static String getUrl(DownloadProperties.URLProperties props, String fileI url = props.getHost() + filesValue; } if (StringUtils.isNotEmpty(species)) { - url = url.replaceAll(PUT_SPECIES_HERE_MARK, species); + url = url.replace(PUT_SPECIES_HERE_MARK, species); } if (StringUtils.isNotEmpty(assembly)) { - url = url.replaceAll(PUT_ASSEMBLY_HERE_MARK, assembly); + url = url.replace(PUT_ASSEMBLY_HERE_MARK, assembly); } if (StringUtils.isNotEmpty(chromosome)) { - url = url.replaceAll(PUT_CHROMOSOME_HERE_MARK, chromosome); + url = url.replace(PUT_CHROMOSOME_HERE_MARK, chromosome); } return url; } @@ -542,8 +521,7 @@ public static boolean isExecutableAvailable(String executable) throws IOExceptio public static String getFilenameFromProps(DownloadProperties.URLProperties props, String fileId) throws CellBaseException { if (!props.getFiles().containsKey(fileId)) { - throw new CellBaseException("File ID " + fileId + " is missing in the DownloadProperties.URLProperties within the CellBase" - + " configuration file"); + throw new CellBaseException(getMissingFileIdMessage(fileId)); } return getFilenameFromUrl(props.getFiles().get(fileId)); } @@ -551,4 +529,8 @@ public static String getFilenameFromProps(DownloadProperties.URLProperties props public static String getFilenameFromUrl(String url) { return Paths.get(url).getFileName().toString(); } + + private static String getMissingFileIdMessage(String fileId) { + return "File ID " + fileId + " is missing in the 
DownloadProperties.URLProperties within the CellBase configuration file"; + } } From 3e438746890db156393ca5b1f98c44a53510bcc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 24 Apr 2024 18:03:53 +0200 Subject: [PATCH 045/148] lib: move the function to parse and build PFMs from the regulation downloader to the regulation builder; and improve regulation builder by adding checks, log messages and fixing sonnar issues, #TASK-5776, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 18 ++- .../org/opencb/cellbase/lib/EtlCommons.java | 18 ++- .../builders/RegulatoryFeatureBuilder.java | 124 +++++++++++++++--- .../download/RegulationDownloadManager.java | 50 +------ .../RegulatoryFeatureBuilderTest.java | 2 +- 5 files changed, 137 insertions(+), 75 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 3d3b9d9d37..718595bfe5 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -210,6 +210,9 @@ private CellBaseBuilder buildObo() { return new OntologyBuilder(oboDir, serializer); } + /** + * @deprecated (when using the new copyVersionFiles) + */ @Deprecated private void copyVersionFiles(List pathList) { for (Path path : pathList) { @@ -268,11 +271,16 @@ private CellBaseBuilder buildRevel() { return new RevelScoreBuilder(missensePredictionScorePath, serializer); } - private CellBaseBuilder buildRegulation() { - Path regulatoryRegionFilesDir = downloadFolder.resolve("regulation"); - copyVersionFiles(Collections.singletonList(regulatoryRegionFilesDir.resolve("ensemblRegulationVersion.json"))); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, 
"regulatory_region"); - return new RegulatoryFeatureBuilder(regulatoryRegionFilesDir, serializer); + private CellBaseBuilder buildRegulation() throws CellBaseException { + // Sanity check + Path regulationDownloadPath = downloadFolder.resolve(REGULATION_DATA); + Path regulationBuildPath = buildFolder.resolve(REGULATION_DATA); + copyVersionFiles(Arrays.asList(regulationDownloadPath.resolve(REGULATORY_BUILD_VERSION_FILENAME), + regulationDownloadPath.resolve(MOTIF_FEATURES_VERSION_FILENAME)), regulationBuildPath); + + // Create the file serializer and the regulatory feature builder + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(regulationBuildPath, REGULATORY_REGION_BASENAME); + return new RegulatoryFeatureBuilder(regulationDownloadPath, serializer); } private CellBaseBuilder buildProtein() throws CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index fec7904b80..707ecf2714 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -29,6 +29,7 @@ import java.io.File; import java.io.IOException; import java.io.InputStreamReader; +import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; @@ -267,8 +268,11 @@ public final class EtlCommons { public static final String CADD_FILE_ID = "CADD"; // Regulation + public static final String REGULATION_NAME = "Regulation"; public static final String REGULATION_DATA = "regulation"; - public static final String REGULATION_SUBDIRECTORY = "regulation"; + public static final String REGULATION_SUBDIRECTORY = REGULATION_DATA; + public static final String REGULATORY_PFM_BASENAME = "regulatory_pfm"; + public static final String REGULATORY_REGION_BASENAME = "regulatory_region"; // Regulatory build and motif features (see Ensembl files: regulatory build and 
motif features files) public static final String REGULATORY_BUILD_NAME = "Regulatory Build"; public static final String REGULATORY_BUILD_VERSION_FILENAME = "regulatoryBuild" + SUFFIX_VERSION_FILENAME; @@ -530,6 +534,18 @@ public static String getFilenameFromUrl(String url) { return Paths.get(url).getFileName().toString(); } + public static void checkDirectory(Path path, String name) throws CellBaseException { + if (path == null) { + throw new CellBaseException(name + " directory is null"); + } + if (!Files.exists(path)) { + throw new CellBaseException(name + " directory " + path + " does not exist"); + } + if (!Files.isDirectory(path)) { + throw new CellBaseException(name + " directory " + path + " is not a directory"); + } + } + private static String getMissingFileIdMessage(String fileId) { return "File ID " + fileId + " is missing in the DownloadProperties.URLProperties within the CellBase configuration file"; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java index d1ae5fb205..c8067661dc 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java @@ -16,63 +16,149 @@ package org.opencb.cellbase.lib.builders; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.commons.lang3.StringUtils; import org.opencb.biodata.formats.feature.gff.Gff2; import org.opencb.biodata.formats.feature.gff.io.Gff2Reader; import org.opencb.biodata.formats.io.FileFormatException; import org.opencb.biodata.models.core.RegulatoryFeature; +import org.opencb.biodata.models.core.RegulatoryPfm; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; import org.opencb.cellbase.core.serializer.CellBaseSerializer; +import 
java.io.File; import java.io.IOException; +import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; import java.util.HashSet; +import java.util.List; import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import static org.opencb.cellbase.lib.EtlCommons.*; public class RegulatoryFeatureBuilder extends CellBaseBuilder { - private final Path gffFile; - protected Set regulatoryFeatureSet; + private Path regulationPath; + + private Set regulatoryFeatureSet; - public RegulatoryFeatureBuilder(Path regulatoryDirectoryPath, CellBaseSerializer serializer) { + public RegulatoryFeatureBuilder(Path regulationPath, CellBaseSerializer serializer) { super(serializer); - // TODO: fix it ! - gffFile = null; -// gffFile = regulatoryDirectoryPath.resolve(EtlCommons.REGULATORY_FEATURES_FILE); + this.regulationPath = regulationPath; } @Override public void parse() throws Exception { - logger.info("Parsing regulatory features..."); - if (Files.exists(gffFile)) { - parseGffFile(gffFile); - } else { - // TODO: fix it -// logger.warn("No regulatory features GFF file found {}", EtlCommons.REGULATORY_FEATURES_FILE); - logger.warn("Skipping regulatory features GFF file parsing. 
Regulatory feature data models will not be built."); + logger.info(BUILDING_LOG_MESSAGE, REGULATION_NAME); + + // Sanity check + checkDirectory(regulationPath, REGULATION_NAME); + + // Check build regulatory files + List regulatoryFiles = checkFiles(dataSourceReader.readValue(regulationPath.resolve(REGULATORY_BUILD_VERSION_FILENAME) + .toFile()), regulationPath, REGULATION_NAME + "/" + REGULATORY_BUILD_NAME); + if (regulatoryFiles.size() != 1) { + throw new CellBaseException("One " + REGULATORY_BUILD_NAME + " file is expected, but currently there are " + + regulatoryFiles.size() + " files"); } + + // Check motif features files + List motifFeaturesFiles = checkFiles(dataSourceReader.readValue(regulationPath.resolve(MOTIF_FEATURES_VERSION_FILENAME) + .toFile()), regulationPath, REGULATION_NAME + "/" + MOTIF_FEATURES_NAME); + if (motifFeaturesFiles.size() != 2) { + throw new CellBaseException("Two " + MOTIF_FEATURES_NAME + " files are expected, but currently there are " + + motifFeaturesFiles.size() + " files"); + } + + // Downloading and building pfm matrices + File motifFile = motifFeaturesFiles.get(0).getName().endsWith("tbi") ? 
motifFeaturesFiles.get(1) : motifFeaturesFiles.get(0); + loadPfmMatrices(motifFile.toPath(), serializer.getOutdir()); + + // Parse regulatory build features + parseGffFile(regulatoryFiles.get(0).toPath()); + + logger.info(BUILDING_DONE_LOG_MESSAGE, REGULATION_NAME); } protected void parseGffFile(Path regulatoryFeatureFile) throws IOException, NoSuchMethodException, FileFormatException { + logger.info(PARSING_LOG_MESSAGE, regulatoryFeatureFile); + + // Create and populate regulatory feature set regulatoryFeatureSet = new HashSet<>(); - if (regulatoryFeatureFile != null && Files.exists(regulatoryFeatureFile) && !Files.isDirectory(regulatoryFeatureFile) - && Files.size(regulatoryFeatureFile) > 0) { - Gff2Reader regulatoryFeatureReader = new Gff2Reader(regulatoryFeatureFile); + try (Gff2Reader regulatoryFeatureReader = new Gff2Reader(regulatoryFeatureFile)) { Gff2 feature; while ((feature = regulatoryFeatureReader.read()) != null) { regulatoryFeatureSet.add(feature); } - regulatoryFeatureReader.close(); } - int i = 0; // Serialize and save results for (Gff2 feature : regulatoryFeatureSet) { - // ID=TF_binding_site:ENSR00000243312; + // In order to get the ID we split the attribute format: ID=TF_binding_site:ENSR00000243312; .... 
String id = feature.getAttribute().split(";")[0].split(":")[1]; RegulatoryFeature regulatoryFeature = new RegulatoryFeature(id, feature.getSequenceName(), feature.getFeature(), feature.getStart(), feature.getEnd()); serializer.serialize(regulatoryFeature); } serializer.close(); + + logger.info(PARSING_DONE_LOG_MESSAGE, regulatoryFeatureFile); + } + + private void loadPfmMatrices(Path motifGffFile, Path buildFolder) throws IOException, NoSuchMethodException, FileFormatException, + InterruptedException { + Path regulatoryPfmPath = buildFolder.resolve(REGULATORY_PFM_BASENAME + ".json.gz"); + logger.info("Downloading and building PFM matrices in {} from {} ...", regulatoryPfmPath, motifGffFile); + if (Files.exists(regulatoryPfmPath)) { + logger.info("{} is already built", regulatoryPfmPath); + return; + } + + Set motifIds = new HashSet<>(); + try (Gff2Reader motifsFeatureReader = new Gff2Reader(motifGffFile)) { + Gff2 tfbsMotifFeature; + Pattern filePattern = Pattern.compile("ENSPFM(\\d+)"); + while ((tfbsMotifFeature = motifsFeatureReader.read()) != null) { + String pfmId = getMatrixId(filePattern, tfbsMotifFeature); + if (StringUtils.isNotEmpty(pfmId)) { + motifIds.add(pfmId); + } + } + } + + ObjectMapper mapper = new ObjectMapper(); + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, REGULATORY_PFM_BASENAME, true); + if (logger.isInfoEnabled()) { + logger.info("Looking up {} PFMs", motifIds.size()); + } + for (String pfmId : motifIds) { + String urlString = "https://rest.ensembl.org/species/homo_sapiens/binding_matrix/" + pfmId + + "?unit=frequencies;content-type=application/json"; + URL url = new URL(urlString); + RegulatoryPfm regulatoryPfm = mapper.readValue(url, RegulatoryPfm.class); + serializer.serialize(regulatoryPfm); + // https://github.com/Ensembl/ensembl-rest/wiki/Rate-Limits + TimeUnit.MILLISECONDS.sleep(250); + } + serializer.close(); + + logger.info("Downloading and building PFM matrices at {} done.", regulatoryPfmPath); + } + 
+ private String getMatrixId(Pattern pattern, Gff2 tfbsMotifFeature) { + Matcher matcher = pattern.matcher(tfbsMotifFeature.getAttribute()); + if (matcher.find()) { + return matcher.group(0); + } + return null; + } + + public Set getRegulatoryFeatureSet() { + return regulatoryFeatureSet; } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java index d11e907aa0..56d15bf844 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java @@ -89,60 +89,12 @@ private List downloadRegulatoryaAndMotifFeatures() throws IOExcept downloadFiles.add(downloadFile); urls.add(downloadFile.getUrl()); // Save data source (name, category, version,...) - saveDataSource(MOTIF_FEATURES_NAME, REGULATION_DATA, "(Ensembl " + ensemblVersion + ")", getTimeStamp(), urls, + saveDataSource(MOTIF_FEATURES_NAME, REGULATION_DATA, "(" + ENSEMBL_NAME + " " + ensemblVersion + ")", getTimeStamp(), urls, regulationFolder.resolve(MOTIF_FEATURES_VERSION_FILENAME)); - // TODO: This will be executed in the CellBase build -// loadPfmMatrices(); - return downloadFiles; } -// private void loadPfmMatrices() -// throws IOException, NoSuchMethodException, FileFormatException, InterruptedException, CellBaseException { -// logger.info("Downloading and building pfm matrices..."); -// if (Files.exists(buildFolder.resolve("regulatory_pfm.json.gz"))) { -// logger.info("regulatory_pfm.json.gz is already built"); -// return; -// } -// Set motifIds = new HashSet<>(); -// Path motifGffFile = regulationFolder.resolve(EtlCommons.MOTIF_FEATURES_FILE); -// try (Gff2Reader motifsFeatureReader = new Gff2Reader(motifGffFile)) { -// Gff2 tfbsMotifFeature; -// Pattern filePattern = Pattern.compile("ENSPFM(\\d+)"); -// while ((tfbsMotifFeature = 
motifsFeatureReader.read()) != null) { -// String pfmId = getMatrixId(filePattern, tfbsMotifFeature); -// if (StringUtils.isNotEmpty(pfmId)) { -// motifIds.add(pfmId); -// } -// } -// } -// -// ObjectMapper mapper = new ObjectMapper(); -// CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "regulatory_pfm", true); -// if (logger.isInfoEnabled()) { -// logger.info("Looking up {} pfms", motifIds.size()); -// } -// for (String pfmId : motifIds) { -// String urlString = "https://rest.ensembl.org/species/homo_sapiens/binding_matrix/" + pfmId -// + "?unit=frequencies;content-type=application/json"; -// URL url = new URL(urlString); -// RegulatoryPfm regulatoryPfm = mapper.readValue(url, RegulatoryPfm.class); -// serializer.serialize(regulatoryPfm); -// // https://github.com/Ensembl/ensembl-rest/wiki/Rate-Limits -// TimeUnit.MILLISECONDS.sleep(250); -// } -// serializer.close(); -// } -// -// private String getMatrixId(Pattern pattern, Gff2 tfbsMotifFeature) { -// Matcher matcher = pattern.matcher(tfbsMotifFeature.getAttribute()); -// if (matcher.find()) { -// return matcher.group(0); -// } -// return null; -// } - private DownloadFile downloadMirna() throws IOException, InterruptedException, CellBaseException { logger.info("Downloading {} ...", MIRBASE_NAME); return downloadAndSaveDataSource(configuration.getDownload().getMirbase(), MIRBASE_FILE_ID, MIRBASE_NAME, REGULATION_DATA, diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilderTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilderTest.java index 1bd36998b6..cde955fb63 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilderTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilderTest.java @@ -33,7 +33,7 @@ public void testParse() throws Exception { CellBaseSerializer serializer = new 
CellBaseJsonFileSerializer(Paths.get("/tmp/"), "regulatory_feature", true); RegulatoryFeatureBuilder parser = new RegulatoryFeatureBuilder(regulationDirectoryPath, serializer); parser.parse(); - Set features = parser.regulatoryFeatureSet; + Set features = parser.getRegulatoryFeatureSet(); assertEquals(1, features.size()); Gff2 feature = features.iterator().next(); From 959e42365fe80db418377e140fdbe464eb233f52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 25 Apr 2024 12:11:06 +0200 Subject: [PATCH 046/148] core: update ontology section of the CellBase configuration since ontology versions will be taken from the OBO files content, #TASK-5775, #TASK-5564 --- cellbase-core/src/main/resources/configuration.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index a2330cd00c..af817b1844 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -247,22 +247,22 @@ download: ## OBO Ontologies hpoObo: host: http://purl.obolibrary.org/obo/ - version: "2024-03-01" + ## The version is retrieved from the OBO file files: HPO: hp.obo goObo: - host: http://purl.obolibrary.org/obo/go/ - version: "2024-03-01" + host: http://purl.obolibrary.org/obo/ + ## The version is retrieved from the OBO file files: GO: go/go-basic.obo doidObo: host: http://purl.obolibrary.org/obo/ - version: "2024-03-01" + ## The version is retrieved from the OBO file files: DOID: doid.obo mondoObo: host: http://purl.obolibrary.org/obo/ - version: "2024-03-01" + ## The version is retrieved from the OBO file files: MONDO: mondo.obo From 158c259a25fdd3f898401a6eea551d52e2c14180 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 25 Apr 2024 12:12:07 +0200 Subject: [PATCH 047/148] lib: update ontology download since ontology 
versions will be taken from the OBO files content; and improve log messages, #TASK-5775, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 3 +- .../lib/download/OntologyDownloadManager.java | 50 +++++++++++++++---- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 707ecf2714..fafd01a0f3 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -232,8 +232,9 @@ public final class EtlCommons { public static final String WINDOW_MASKER_FILE_ID = "WINDOW_MASKER"; // Ontology + public static final String ONTOLOGY_NAME = "Ontology"; public static final String ONTOLOGY_DATA = "ontology"; - public static final String ONTOLOGY_SUBDIRECTORY = "ontology"; + public static final String ONTOLOGY_SUBDIRECTORY = ONTOLOGY_DATA; // HPO public static final String HPO_OBO_NAME = "HPO"; public static final String HPO_OBO_VERSION_FILENAME = "hpoObo" + SUFFIX_VERSION_FILENAME; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java index b09cf76f2f..4a91d84225 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java @@ -18,17 +18,22 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.commons.utils.FileUtils; +import java.io.BufferedReader; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import static org.opencb.cellbase.lib.EtlCommons.*; public class 
OntologyDownloadManager extends AbstractDownloadManager { + private static final String DATA_VERSION_FIELD = "data-version:"; + public OntologyDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) throws IOException, CellBaseException { super(species, assembly, targetDirectory, configuration); @@ -37,31 +42,58 @@ public OntologyDownloadManager(String species, String assembly, Path targetDirec public List download() throws IOException, InterruptedException, CellBaseException { Path oboFolder = downloadFolder.resolve(ONTOLOGY_SUBDIRECTORY); Files.createDirectories(oboFolder); - logger.info("Downloading {} files {} ...", ONTOLOGY_DATA, oboFolder); + logger.info(DOWNLOADING_LOG_MESSAGE, ONTOLOGY_NAME); DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); // HPO - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getHpoObo(), HPO_OBO_NAME, ONTOLOGY_DATA, - HPO_OBO_FILE_ID, HPO_OBO_VERSION_FILENAME, oboFolder); + downloadFile = downloadDataSource(configuration.getDownload().getHpoObo(), HPO_OBO_FILE_ID, oboFolder); + String version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); + saveDataSource(HPO_OBO_NAME, ONTOLOGY_NAME, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(HPO_OBO_VERSION_FILENAME)); downloadFiles.add(downloadFile); // GO - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGoObo(), GO_OBO_NAME, ONTOLOGY_DATA, - GO_OBO_FILE_ID, GO_OBO_VERSION_FILENAME, oboFolder); + downloadFile = downloadDataSource(configuration.getDownload().getGoObo(), GO_OBO_FILE_ID, oboFolder); + version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); + saveDataSource(GO_OBO_NAME, ONTOLOGY_NAME, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(GO_OBO_VERSION_FILENAME)); downloadFiles.add(downloadFile); // DOID - downloadFile = 
downloadAndSaveDataSource(configuration.getDownload().getDoidObo(), DOID_OBO_NAME, ONTOLOGY_DATA, - DOID_OBO_FILE_ID, DOID_OBO_VERSION_FILENAME, oboFolder); + downloadFile = downloadDataSource(configuration.getDownload().getDoidObo(), DOID_OBO_FILE_ID, oboFolder); + version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); + saveDataSource(DOID_OBO_NAME, ONTOLOGY_NAME, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(DOID_OBO_VERSION_FILENAME)); downloadFiles.add(downloadFile); // Mondo - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getMondoObo(), MONDO_OBO_NAME, ONTOLOGY_DATA, - MONDO_OBO_FILE_ID, MONDO_OBO_VERSION_FILENAME, oboFolder); + downloadFile = downloadDataSource(configuration.getDownload().getMondoObo(), MONDO_OBO_FILE_ID, oboFolder); + version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); + saveDataSource(MONDO_OBO_NAME, ONTOLOGY_NAME, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(MONDO_OBO_VERSION_FILENAME)); downloadFiles.add(downloadFile); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, ONTOLOGY_NAME); + return downloadFiles; } + + private String getVersionFromOboFile(Path oboPath) throws CellBaseException, IOException { + String version = null; + if (!oboPath.toFile().exists()) { + throw new CellBaseException("OBO file " + oboPath + " does not exit"); + } + try (BufferedReader reader = FileUtils.newBufferedReader(oboPath)) { + String line; + while ((line = reader.readLine()) != null) { + if (line.startsWith(DATA_VERSION_FIELD)) { + version = line.split(DATA_VERSION_FIELD)[1].trim(); + break; + } + } + } + return version; + } } From 0b83831ce59e8d98f553af7adadc317b6cc1830e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 25 Apr 2024 13:21:38 +0200 Subject: [PATCH 048/148] app: update the build command executor to check/copy the ontology 
version files before creating the ontology builder, #TASK-5576, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 718595bfe5..42392f523b 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -204,10 +204,18 @@ private CellBaseBuilder buildRepeats() throws CellBaseException { return new RepeatsBuilder(repeatsDownloadPath, serializer, configuration); } - private CellBaseBuilder buildObo() { - Path oboDir = downloadFolder.resolve(ONTOLOGY_DATA); - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, EtlCommons.OBO_JSON); - return new OntologyBuilder(oboDir, serializer); + private CellBaseBuilder buildObo() throws CellBaseException { + Path oboDownloadPath = downloadFolder.resolve(ONTOLOGY_SUBDIRECTORY); + Path oboBuildPath = buildFolder.resolve(ONTOLOGY_SUBDIRECTORY); + List versionPaths = Arrays.asList(oboDownloadPath.resolve(HPO_OBO_VERSION_FILENAME), + oboDownloadPath.resolve(GO_OBO_VERSION_FILENAME), + oboDownloadPath.resolve(DOID_OBO_VERSION_FILENAME), + oboDownloadPath.resolve(MONDO_OBO_VERSION_FILENAME)); + copyVersionFiles(versionPaths, oboBuildPath); + + // Create serializer and return the ontology builder + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(oboBuildPath, OBO_BASENAME); + return new OntologyBuilder(oboDownloadPath, serializer); } /** From 39f0f4148000cfc881fb6ac7cf9732688d0582cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 25 Apr 2024 13:23:18 +0200 Subject: [PATCH 049/148] lib: improve the ontology 
builder by removing hardcoded filenames, adding log messages and refactoring code, #TASK-5576, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 3 +- .../lib/builders/OntologyBuilder.java | 79 +++++++++++-------- 2 files changed, 45 insertions(+), 37 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index fafd01a0f3..83b6d2d562 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -235,6 +235,7 @@ public final class EtlCommons { public static final String ONTOLOGY_NAME = "Ontology"; public static final String ONTOLOGY_DATA = "ontology"; public static final String ONTOLOGY_SUBDIRECTORY = ONTOLOGY_DATA; + public static final String OBO_BASENAME = "ontology"; // HPO public static final String HPO_OBO_NAME = "HPO"; public static final String HPO_OBO_VERSION_FILENAME = "hpoObo" + SUFFIX_VERSION_FILENAME; @@ -359,8 +360,6 @@ public final class EtlCommons { public static final String CLINICAL_VARIANTS_JSON_FILE = "clinical_variants.json.gz"; public static final String CLINICAL_VARIANTS_ANNOTATED_JSON_FILE = "clinical_variants.full.json.gz"; public static final String DOCM_NAME = "DOCM"; - - public static final String OBO_JSON = "ontology"; public static final String HPO_VERSION_FILE = "hpo" + SUFFIX_VERSION_FILENAME; public static final String GO_VERSION_FILE = "go" + SUFFIX_VERSION_FILENAME; public static final String DO_VERSION_FILE = "do" + SUFFIX_VERSION_FILENAME; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java index cbe7c56952..679e0d30f8 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java @@ 
-19,60 +19,69 @@ import org.opencb.biodata.formats.obo.OboParser; import org.opencb.biodata.models.core.OntologyTerm; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.opencb.commons.utils.FileUtils; import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; import java.nio.file.Path; import java.util.List; +import static org.opencb.cellbase.lib.EtlCommons.*; + public class OntologyBuilder extends CellBaseBuilder { - private Path hpoFile; - private Path goFile; - private Path doidFile; - private Path mondoFile; + private Path oboDownloadPath; - public OntologyBuilder(Path oboDirectoryPath, CellBaseSerializer serializer) { + public OntologyBuilder(Path oboDownloadPath, CellBaseSerializer serializer) { super(serializer); - // TODO: fix it !! -// hpoFile = oboDirectoryPath.resolve(EtlCommons.HPO_FILE); -// goFile = oboDirectoryPath.resolve(EtlCommons.GO_FILE); -// doidFile = oboDirectoryPath.resolve(EtlCommons.DOID_FILE); -// mondoFile = oboDirectoryPath.resolve(EtlCommons.MONDO_FILE); + this.oboDownloadPath = oboDownloadPath; } @Override public void parse() throws Exception { - BufferedReader bufferedReader = FileUtils.newBufferedReader(hpoFile); - OboParser parser = new OboParser(); - List terms = parser.parseOBO(bufferedReader, "Human Phenotype Ontology"); - for (OntologyTerm term : terms) { - term.setSource("HP"); - serializer.serialize(term); - } + logger.info(BUILDING_LOG_MESSAGE, ONTOLOGY_NAME); - bufferedReader = FileUtils.newBufferedReader(goFile); - terms = parser.parseOBO(bufferedReader, "Gene Ontology"); - for (OntologyTerm term : terms) { - term.setSource("GO"); - serializer.serialize(term); - } + // Sanity check + checkDirectory(oboDownloadPath, REGULATION_NAME); - bufferedReader = FileUtils.newBufferedReader(doidFile); - terms = parser.parseOBO(bufferedReader, "Human Disease Ontology"); - for (OntologyTerm term : terms) { - 
term.setSource("DOID"); - serializer.serialize(term); - } + // Check ontology files + List hpoFiles = checkOboFiles(oboDownloadPath.resolve(HPO_OBO_VERSION_FILENAME), HPO_OBO_NAME); + List goFiles = checkOboFiles(oboDownloadPath.resolve(GO_OBO_VERSION_FILENAME), GO_OBO_NAME); + List doidFiles = checkOboFiles(oboDownloadPath.resolve(DOID_OBO_VERSION_FILENAME), DOID_OBO_NAME); + List mondoFiles = checkOboFiles(oboDownloadPath.resolve(MONDO_OBO_VERSION_FILENAME), MONDO_OBO_NAME); - bufferedReader = FileUtils.newBufferedReader(mondoFile); - terms = parser.parseOBO(bufferedReader, "Mondo Ontology"); - for (OntologyTerm term : terms) { - term.setSource("MONDO"); - serializer.serialize(term); - } + // Parse OBO files and build + parseOboFile(hpoFiles.get(0), HPO_OBO_NAME); + parseOboFile(goFiles.get(0), GO_OBO_NAME); + parseOboFile(doidFiles.get(0), DOID_OBO_NAME); + parseOboFile(mondoFiles.get(0), MONDO_OBO_NAME); + // Close serializer serializer.close(); + + logger.info(BUILDING_DONE_LOG_MESSAGE, ONTOLOGY_NAME); + } + + private void parseOboFile(File oboFile, String name) throws IOException { + logger.info(PARSING_LOG_MESSAGE, oboFile); + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(oboFile.toPath())) { + OboParser parser = new OboParser(); + List terms = parser.parseOBO(bufferedReader, name); + for (OntologyTerm term : terms) { + serializer.serialize(term); + } + } + logger.info(PARSING_DONE_LOG_MESSAGE, oboFile); + } + + private List checkOboFiles(Path versionFilePath, String name) throws IOException, CellBaseException { + List files = checkFiles(dataSourceReader.readValue(versionFilePath.toFile()), oboDownloadPath, ONTOLOGY_NAME + "/" + name); + if (files.size() != 1) { + throw new CellBaseException("One " + name + " file is expected, but currently there are " + files.size() + " files"); + } + return files; } } From 5c3dae0c6ddd23345f1e03bf9ec0873a833d0788 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: 
Thu, 25 Apr 2024 16:28:58 +0200 Subject: [PATCH 050/148] lib: improve the PharmGKB downloader by moving the function to unzip PharmGKB files from the downloader to the PharmGKB builder and adding log messages, #TASK-5775, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 1 + .../lib/download/PharmGKBDownloadManager.java | 44 +++++-------------- 2 files changed, 12 insertions(+), 33 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 83b6d2d562..56ad6dae8b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -155,6 +155,7 @@ public final class EtlCommons { public static final String SPLICE_SCORE_DATA = "splice_score"; // Pharmacogenomics + public static final String PHARMACOGENOMICS_NAME = "Pharmacogenomics"; public static final String PHARMACOGENOMICS_DATA = "pharmacogenomics"; public static final String PHARMACOGENOMICS_SUBDIRECTORY = "pharmacogenomics"; // PharmGKB diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java index f52c3f8a23..873387f94b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java @@ -19,16 +19,11 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.commons.exec.Command; -import org.opencb.commons.utils.FileUtils; import java.io.IOException; -import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.Paths; import java.util.ArrayList; -import 
java.util.Collections; import java.util.List; import java.util.Map; @@ -43,49 +38,32 @@ public PharmGKBDownloadManager(String species, String assembly, Path targetDirec @Override public List download() throws IOException, InterruptedException, CellBaseException { - DownloadProperties.URLProperties pharmGKB = configuration.getDownload().getPharmGKB(); + logger.info(DOWNLOADING_LOG_MESSAGE, PHARMACOGENOMICS_NAME); + Path pharmgkbDownloadFolder = downloadFolder.resolve(PHARMACOGENOMICS_SUBDIRECTORY).resolve(PHARMGKB_SUBDIRECTORY); Files.createDirectories(pharmgkbDownloadFolder); - logger.info("Downloading {} files at {} ...", PHARMGKB_DATA, pharmgkbDownloadFolder); + + DownloadProperties.URLProperties pharmGKBProps = configuration.getDownload().getPharmGKB(); List urls = new ArrayList<>(); List downloadFiles = new ArrayList<>(); - String host = pharmGKB.getHost(); - for (Map.Entry entry : pharmGKB.getFiles().entrySet()) { + String host = pharmGKBProps.getHost(); + for (Map.Entry entry : pharmGKBProps.getFiles().entrySet()) { String url = host + entry.getValue(); urls.add(url); - Path downloadedFileName = Paths.get(new URL(url).getPath()).getFileName(); - Path downloadedFilePath = pharmgkbDownloadFolder.resolve(downloadedFileName); - logger.info("Downloading file {} to {}", url, downloadedFilePath); + Path downloadedFilePath = pharmgkbDownloadFolder.resolve(getFilenameFromUrl(url)); + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, downloadedFilePath); DownloadFile downloadFile = downloadFile(url, downloadedFilePath.toString()); downloadFiles.add(downloadFile); - - // Unzip downloaded file - unzip(downloadedFilePath.getParent(), downloadedFileName.toString(), Collections.emptyList(), - pharmgkbDownloadFolder.resolve(downloadedFileName.toString().split("\\.")[0])); } // Save versions - saveDataSource(PHARMGKB_NAME, PHARMACOGENOMICS_DATA, pharmGKB.getVersion(), getTimeStamp(), urls, + saveDataSource(PHARMGKB_NAME, PHARMACOGENOMICS_NAME, pharmGKBProps.getVersion(), 
getTimeStamp(), urls, pharmgkbDownloadFolder.resolve(PHARMGKB_VERSION_FILENAME)); - return downloadFiles; - } - - private void unzip(Path inPath, String zipFilename, List outFilenames, Path outPath) throws IOException { - // Check zip file exists - FileUtils.checkFile(inPath.resolve(zipFilename)); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, PHARMACOGENOMICS_NAME); - // Unzip files if output dir does NOT exist - if (!outPath.toFile().exists()) { - logger.info("Unzipping {} into {}", zipFilename, outPath); - Command cmd = new Command("unzip -d " + outPath + " " + inPath.resolve(zipFilename)); - cmd.run(); - // Check if expected files exist - for (String outFilename : outFilenames) { - FileUtils.checkFile(outPath.resolve(outFilename)); - } - } + return downloadFiles; } } From 971235e7bff2dbe1ce27efa6898ebf520757b8b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 25 Apr 2024 18:47:04 +0200 Subject: [PATCH 051/148] lib: improve the PharmGKB builder by adding checks and log messages; and move the function to unzip PharmGKB files from the downloader to the builder, #TASK-5776, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 23 ++-- .../org/opencb/cellbase/lib/EtlCommons.java | 8 +- .../lib/builders/PharmGKBBuilder.java | 127 +++++++++++++----- 3 files changed, 104 insertions(+), 54 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 42392f523b..8225648820 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -409,22 +409,15 @@ private CellBaseBuilder buildPubMed() throws IOException { return new PubMedBuilder(pubmedInputFolder, serializer); } - private CellBaseBuilder 
buildPharmacogenomics() throws IOException { - Path inFolder = downloadFolder.resolve(EtlCommons.PHARMACOGENOMICS_DATA); - Path outFolder = buildFolder.resolve(EtlCommons.PHARMACOGENOMICS_DATA); - if (!outFolder.toFile().exists()) { - outFolder.toFile().mkdirs(); - } - - logger.info("Copying PharmGKB version file..."); - if (inFolder.resolve(PHARMGKB_DATA).resolve(EtlCommons.PHARMGKB_VERSION_FILENAME).toFile().exists()) { - Files.copy(inFolder.resolve(PHARMGKB_DATA).resolve(EtlCommons.PHARMGKB_VERSION_FILENAME), - outFolder.resolve(EtlCommons.PHARMGKB_VERSION_FILENAME), - StandardCopyOption.REPLACE_EXISTING); - } + private CellBaseBuilder buildPharmacogenomics() throws CellBaseException { + // Sanity check + Path pharmGkbDownloadPath = downloadFolder.resolve(PHARMACOGENOMICS_DATA).resolve(PHARMGKB_DATA); + Path pharmGkbBuildPath = buildFolder.resolve(PHARMACOGENOMICS_DATA).resolve(PHARMGKB_DATA); + copyVersionFiles(Arrays.asList(pharmGkbDownloadPath.resolve(PHARMGKB_VERSION_FILENAME)), pharmGkbBuildPath); - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(outFolder); - return new PharmGKBBuilder(inFolder, serializer); + // Create the file serializer and the PharmGKB feature builder + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(pharmGkbBuildPath); + return new PharmGKBBuilder(pharmGkbDownloadPath, serializer); } private void checkVersionFiles(List versionPaths) throws CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 56ad6dae8b..8b56fc9d0f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -161,8 +161,8 @@ public final class EtlCommons { // PharmGKB public static final String PHARMGKB_NAME = "PharmGKB"; public static final String PHARMGKB_DATA = "pharmgkb"; - public static final String 
PHARMGKB_SUBDIRECTORY = PHARMGKB_DATA; - public static final String PHARMGKB_VERSION_FILENAME = PHARMGKB_DATA + SUFFIX_VERSION_FILENAME; + public static final String PHARMGKB_SUBDIRECTORY = "pharmgkb"; + public static final String PHARMGKB_VERSION_FILENAME = "pharmGKB" + SUFFIX_VERSION_FILENAME; // Must match the configuration file public static final String PHARMGKB_GENES_FILE_ID = "GENES"; public static final String PHARMGKB_CHEMICALS_FILE_ID = "CHEMICALS"; @@ -396,8 +396,8 @@ public static boolean runCommandLineProcess(File workingDirectory, String binPat // Check process output if (process.exitValue() != 0) { - String msg = "Error executing command '" + binPath + "'; error code = " + process.exitValue() + ". More info in log file: " - + logFilePath; + String msg = "Error executing command '" + binPath + "'; args = " + args + ", error code = " + process.exitValue() + + ". More info in log file: " + logFilePath; logger.error(msg); throw new CellBaseException(msg); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java index 1f7a4836ca..0e6017fc01 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java @@ -23,13 +23,16 @@ import org.opencb.biodata.models.core.Xref; import org.opencb.biodata.models.pharma.*; import org.opencb.biodata.models.pharma.guideline.BasicObject; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; +import org.opencb.cellbase.lib.EtlCommons; import org.opencb.commons.utils.FileUtils; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.*; import java.util.stream.Collectors; @@ -37,8 +40,7 @@ public class PharmGKBBuilder 
extends CellBaseBuilder { - private final Path inputDir; - private final Path pharmGKBDir; + private final Path pharmGkbDownloadPath; private static final String CHEMICALS_BASENAME = "chemicals"; private static final String CHEMICALS_TSV_FILENAME = "chemicals.tsv"; @@ -88,21 +90,24 @@ public class PharmGKBBuilder extends CellBaseBuilder { private static final String PHARMGKB_LAST_UPDATE_DATE_KEY = "PHARMGKB_LAST_UPDATE_DATE"; private static final String PHARMGKB_IS_VIP_KEY = "PHARMGKB_IS_VIP"; - public PharmGKBBuilder(Path inputDir, CellBaseFileSerializer serializer) { + public PharmGKBBuilder(Path parmGkbDownloadPath, CellBaseFileSerializer serializer) { super(serializer); - - this.inputDir = inputDir; - this.pharmGKBDir = inputDir.resolve(PHARMGKB_DATA); + this.pharmGkbDownloadPath = parmGkbDownloadPath; } @Override public void parse() throws Exception { - // Check input folder - FileUtils.checkDirectory(inputDir); + logger.info(BUILDING_LOG_MESSAGE, PHARMGKB_NAME); + + // Sanity check + checkDirectory(pharmGkbDownloadPath, PHARMGKB_NAME); - // PharmGKB - FileUtils.checkDirectory(pharmGKBDir); - logger.info("Parsing {} files and building the data models...", PHARMGKB_NAME); + // Check PharmGKB files + List pharmGkbFiles = checkFiles(dataSourceReader.readValue(pharmGkbDownloadPath.resolve(PHARMGKB_VERSION_FILENAME).toFile()), + pharmGkbDownloadPath, PHARMACOGENOMICS_NAME + "/" + PHARMGKB_NAME); + + // Unzip downloaded file + unzipDownloadedFiles(pharmGkbFiles); // Parse chemical file Map chemicalsMap = parseChemicalFile(); @@ -113,8 +118,6 @@ public void parse() throws Exception { // Parse gene file parseGeneFile(chemicalsMap); - logger.info("Parsing {} files finished.", PHARMGKB_NAME); - // Generation the pharmacogenomics JSON file logger.info("Writing {} JSON file to {} ...", PHARMACOGENOMICS_DATA, serializer.getOutdir()); int counter = 0; @@ -125,11 +128,14 @@ public void parse() throws Exception { } } serializer.close(); - logger.info("Writing {} JSON file 
done!", PHARMACOGENOMICS_DATA); + + logger.info(BUILDING_DONE_LOG_MESSAGE, PHARMGKB_NAME); } private Map parseChemicalFile() throws IOException { - Path chemicalsFile = pharmGKBDir.resolve(CHEMICALS_BASENAME).resolve(CHEMICALS_TSV_FILENAME); + Path chemicalsFile = serializer.getOutdir().resolve(CHEMICALS_BASENAME).resolve(CHEMICALS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, chemicalsFile); + Map chemicalsMap = new HashMap<>(); try (BufferedReader br = FileUtils.newBufferedReader(chemicalsFile)) { // Skip first line, i.e. the header line @@ -177,6 +183,7 @@ private Map parseChemicalFile() throws IOException { } logger.info("Number of Chemical items read {}", chemicalsMap.size()); + logger.info(PARSING_DONE_LOG_MESSAGE, chemicalsFile); return chemicalsMap; } @@ -192,8 +199,9 @@ private void parseClinicalAnnotationFiles(Map chemicalsM Map> variantMap = parseVariantFile(); // clinical_annotations.tsv - try (BufferedReader br = FileUtils.newBufferedReader(pharmGKBDir.resolve(CLINICAL_ANNOTATIONS_BASENAME) - .resolve(CLINICAL_ANNOTATIONS_TSV_FILENAME))) { + Path clinAnnotPath = serializer.getOutdir().resolve(CLINICAL_ANNOTATIONS_BASENAME).resolve(CLINICAL_ANNOTATIONS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, clinAnnotPath); + try (BufferedReader br = FileUtils.newBufferedReader(clinAnnotPath)) { // Skip first line, i.e. 
the header line String line = br.readLine(); while ((line = br.readLine()) != null) { @@ -278,6 +286,7 @@ private void parseClinicalAnnotationFiles(Map chemicalsM } } } + logger.info(PARSING_DONE_LOG_MESSAGE, clinAnnotPath); // Update the clinical annotation map by parsing the clinical annotation evidences parseClinicalAnnotationEvidenceFile(variantAnnotationMap); @@ -300,7 +309,9 @@ private void parseClinicalAnnotationFiles(Map chemicalsM private Map> parseVariantFile() throws IOException { Map> variantMap = new HashMap<>(); // Parse the variant file (i.e., variants.tsv) - Path varPath = pharmGKBDir.resolve(VARIANTS_BASENAME).resolve(VARIANTS_TSV_FILENAME); + Path varPath = serializer.getOutdir().resolve(VARIANTS_BASENAME).resolve(VARIANTS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, varPath); + try (BufferedReader br = FileUtils.newBufferedReader(varPath)) { // Skip first line, i.e. the header line String line = br.readLine(); @@ -367,6 +378,7 @@ private Map> parseVariantFile() throws IOException { } logger.info("Number of variants = {}", variantMap.size()); + logger.info(PARSING_DONE_LOG_MESSAGE, varPath); return variantMap; } @@ -385,7 +397,8 @@ private void parseClinicalAnnotationEvidenceFile(Map variantAnnotationMap) throws IOException { // Parse the clinical annotation alleles file (i.e., clinical_ann_alleles.tsv) - Path allelesPath = pharmGKBDir.resolve(CLINICAL_ANNOTATIONS_BASENAME).resolve(CLINICAL_ANN_ALLELES_TSV_FILENAME); + Path allelesPath = serializer.getOutdir().resolve(CLINICAL_ANNOTATIONS_BASENAME).resolve(CLINICAL_ANN_ALLELES_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, allelesPath); try (BufferedReader br = FileUtils.newBufferedReader(allelesPath)) { // Skip first line, i.e. 
the header line String line = br.readLine(); @@ -502,12 +520,14 @@ private void parseClinicalAnnotationAlleleFile(Map variantAssociationMap) throws IOException { // For CellBase, variant association corresponds to PharmGKB variant annotation // Parse the variant annotation file (i.e., var_drug_ann.tsv) - Path varDrugPath = pharmGKBDir.resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(VARIANT_ANNOTATIONS_TSV_FILENAME); + Path varDrugPath = serializer.getOutdir().resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(VARIANT_ANNOTATIONS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, varDrugPath); int counter = 0; try (BufferedReader br = FileUtils.newBufferedReader(varDrugPath)) { // Skip first line, i.e. the header line @@ -562,6 +582,7 @@ private void parseVariantAnnotationFile(Map va } } logger.info("Number of variant annotations = {}", counter); + logger.info(PARSING_DONE_LOG_MESSAGE, varDrugPath); } private Map parseGuidelineAnnotationFiles() throws IOException { @@ -571,7 +592,7 @@ private Map parseGuidelineAnnotationFiles() t ObjectReader objectReader = mapper.readerFor(PharmaGuidelineAnnotation.class); // Parse the guideline annotations JSON files - Path guidelinesPath = pharmGKBDir.resolve(GUIDELINE_ANNOTATIONS_BASENAME); + Path guidelinesPath = serializer.getOutdir().resolve(GUIDELINE_ANNOTATIONS_BASENAME); FileUtils.checkDirectory(guidelinesPath); for (File file : Objects.requireNonNull(guidelinesPath.toFile().listFiles())) { if (file.getName().endsWith("json")) { @@ -593,7 +614,8 @@ private Map parseGuidelineAnnotationFiles() t private Map parseDrugLabelAnnotationFile() throws IOException { Map drugLabelAnnotationMap = new HashMap<>(); // Parse the drug labels annotations file (i.e., drugLabels.tsv) - Path drugLabelPath = pharmGKBDir.resolve(DRUG_LABELS_BASENAME).resolve(DRUG_LABELS_TSV_FILENAME); + Path drugLabelPath = serializer.getOutdir().resolve(DRUG_LABELS_BASENAME).resolve(DRUG_LABELS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, drugLabelPath); try 
(BufferedReader br = FileUtils.newBufferedReader(drugLabelPath)) { // Skip first line, i.e. the header line String line = br.readLine(); @@ -631,12 +653,15 @@ private Map parseDrugLabelAnnotationFile() th } logger.info("Number of drug label annotations = {}", drugLabelAnnotationMap.size()); + logger.info(PARSING_DONE_LOG_MESSAGE, drugLabelPath); return drugLabelAnnotationMap; } private void parsePhenotypeAnnotationFile(Map variantAssociationMap) throws IOException { // Parse the variant annotation file (i.e., var_pheno_ann.tsv) - Path varDrugPath = pharmGKBDir.resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(PHENOTYPE_ANNOTATIONS_TSV_FILENAME); + Path varDrugPath = serializer.getOutdir().resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(PHENOTYPE_ANNOTATIONS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, varDrugPath); + int counter = 0; try (BufferedReader br = FileUtils.newBufferedReader(varDrugPath)) { // Skip first line, i.e. the header line @@ -691,11 +716,13 @@ private void parsePhenotypeAnnotationFile(Map } } logger.info("Number of phenotype annotations = {}", counter); + logger.info(PARSING_DONE_LOG_MESSAGE, varDrugPath); } private void parseFunctionalAnnotationFile(Map variantAssociationMap) throws IOException { // Parse the variant annotation file (i.e., var_fa_ann.tsv) - Path varDrugPath = pharmGKBDir.resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(FUNCTIONAL_ANNOTATIONS_TSV_FILENAME); + Path varDrugPath = serializer.getOutdir().resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(FUNCTIONAL_ANNOTATIONS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, varDrugPath); int counter = 0; try (BufferedReader br = FileUtils.newBufferedReader(varDrugPath)) { // Skip first line, i.e. 
the header line @@ -751,12 +778,14 @@ private void parseFunctionalAnnotationFile(Map } } logger.info("Number of variant annotations = {}", counter); + logger.info(PARSING_DONE_LOG_MESSAGE, varDrugPath); } private void parseStudyParameterFile(Map variantAssociationMap) throws IOException { Map> studyParametersMap = new HashMap<>(); // Parse the study parameters file (i.e., study_parameters.tsv) - Path studyParamsPath = pharmGKBDir.resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(STUDY_PARAMETERS_TSV_FILENAME); + Path studyParamsPath = serializer.getOutdir().resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(STUDY_PARAMETERS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, studyParamsPath); try (BufferedReader br = FileUtils.newBufferedReader(studyParamsPath)) { // Skip first line, i.e. the header line String line = br.readLine(); @@ -807,6 +836,7 @@ private void parseStudyParameterFile(Map varia } } logger.info("Number of study parameters lines = {}", studyParametersMap.size()); + logger.info(PARSING_DONE_LOG_MESSAGE, studyParamsPath); for (Map.Entry> entry : studyParametersMap.entrySet()) { if (variantAssociationMap.containsKey(entry.getKey())) { @@ -861,7 +891,8 @@ private void parseGeneFile(Map chemicalsMap) throws IOEx // Parse the genes file (i.e., genes.tsv) Map geneAnnotationMapByPgkbGeneId = new HashMap<>(); - Path genesPath = pharmGKBDir.resolve(GENES_BASENAME).resolve(GENES_TSV_FILENAME); + Path genesPath = serializer.getOutdir().resolve(GENES_BASENAME).resolve(GENES_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, genesPath); try (BufferedReader br = FileUtils.newBufferedReader(genesPath)) { // Skip first line, i.e. 
the header line String line = br.readLine(); @@ -940,13 +971,15 @@ private void parseGeneFile(Map chemicalsMap) throws IOEx } logger.info("Number of parsed genes = {}", geneAnnotationMapByPgkbGeneId.size()); + logger.info(PARSING_DONE_LOG_MESSAGE, genesPath); } private void parseChemicalGeneRelationships(Map> pgkbGeneIdMapByChemicalName, Map geneAnnotationMapByPgkbGeneId) throws IOException { int counter = 0; // Parse the genes file (i.e., relationships.tsv) - Path relationshipsPath = pharmGKBDir.resolve(RELATIONSHIPS_BASENAME).resolve(RELATIONSHIPS_TSV_FILENAME); + Path relationshipsPath = serializer.getOutdir().resolve(RELATIONSHIPS_BASENAME).resolve(RELATIONSHIPS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, relationshipsPath); try (BufferedReader br = FileUtils.newBufferedReader(relationshipsPath)) { // Skip first line, i.e. the header line String line = br.readLine(); @@ -986,6 +1019,7 @@ private void parseChemicalGeneRelationships(Map> pgkbGeneIdM } } logger.info("Number of parsed {}-{} relationships = {}", GENE_ENTITY, CHEMICAL_ENTITY, counter); + logger.info(PARSING_DONE_LOG_MESSAGE, relationshipsPath); } private List stringFieldToList(String field) { @@ -1011,6 +1045,29 @@ private boolean isHaplotype(String value) { } private List getHaplotypeList(String value) { - return Arrays.stream(value.split(",")).map(s -> s.trim()).collect(Collectors.toList()); + return Arrays.stream(value.split(",")).map(String::trim).collect(Collectors.toList()); + } + + private void unzipDownloadedFiles(List pharmGkbFiles) throws CellBaseException { + // Unzip + for (File pharmGgkFile : pharmGkbFiles) { + logger.info("Unzip file: {}", pharmGgkFile); + try { + String outPath = serializer.getOutdir().resolve(pharmGgkFile.getName().split("\\.")[0]).toString(); + List params = Arrays.asList("-d", outPath, "-o", pharmGgkFile.toString()); + EtlCommons.runCommandLineProcess(null, "unzip", params, Paths.get(outPath + ".log").toString()); + } catch (CellBaseException e) { + if 
(pharmGgkFile.getName().contains(GUIDELINE_ANNOTATIONS_BASENAME)) { + // It fails because of long filenames, so it does not raise any exception + logger.warn(e.getMessage()); + } + } catch (IOException e) { + throw new CellBaseException("Error executing unzip in file " + pharmGgkFile, e); + } catch (InterruptedException e) { + // Restore interrupted state... + Thread.currentThread().interrupt(); + throw new CellBaseException("Error executing unzip in file " + pharmGgkFile, e); + } + } } } From cd444b0a037c3754f22ad0d25c04fc1c6c7f8d5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 25 Apr 2024 19:05:22 +0200 Subject: [PATCH 052/148] lib: improve the PubMed downloader by adding log messages and fixing sonnar issues, #TASK-5775, #TASK-5564 --- .../executors/DownloadCommandExecutor.java | 1 + .../org/opencb/cellbase/lib/EtlCommons.java | 7 ++--- .../lib/download/PubMedDownloadManager.java | 26 ++++++++++++------- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java index 8a763ae3c9..8da49800df 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java @@ -112,6 +112,7 @@ public void execute() throws CellBaseException { Thread.currentThread().interrupt(); throw new CellBaseException("Error executing command line 'download': " + e.getMessage(), e); } catch (Exception e) { + e.printStackTrace(); throw new CellBaseException("Error executing command line 'download': " + e.getMessage(), e); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 
8b56fc9d0f..3733f6fb59 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -371,9 +371,10 @@ public final class EtlCommons { // PubMed public static final String PUBMED_NAME = "PubMed"; public static final String PUBMED_DATA = "pubmed"; - public static final String PUBMED_SUBDIRECTORY = PUBMED_DATA; - public static final String PUBMED_VERSION_FILENAME = PUBMED_DATA + SUFFIX_VERSION_FILENAME; - public static final String PUBMED_REGEX_FILE_ID = "PUBMED"; + public static final String PUBMED_SUBDIRECTORY = "pubmed"; + public static final String PUBMED_VERSION_FILENAME = "pubMed" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String PUBMED_REGEX_FILE_ID = "PUBMED_REGEX"; private EtlCommons() { throw new IllegalStateException("Utility class"); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java index 87e4ec8b98..106e3be709 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java @@ -27,6 +27,8 @@ import java.util.Collections; import java.util.List; +import static org.opencb.cellbase.lib.EtlCommons.*; + public class PubMedDownloadManager extends AbstractDownloadManager { public PubMedDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) @@ -36,13 +38,14 @@ public PubMedDownloadManager(String species, String assembly, Path targetDirecto @Override public List download() throws IOException, InterruptedException, CellBaseException { - Path pubmedFolder = downloadFolder.resolve(EtlCommons.PUBMED_SUBDIRECTORY); + logger.info(DOWNLOADING_LOG_MESSAGE, PUBMED_NAME); + + Path pubmedFolder = 
downloadFolder.resolve(PUBMED_SUBDIRECTORY); Files.createDirectories(pubmedFolder); - logger.info("Downloading {} files at {} ...", EtlCommons.PUBMED_DATA, pubmedFolder); // Downloads PubMed XML files - String url = configuration.getDownload().getPubmed().getHost(); - String regexp = configuration.getDownload().getPubmed().getFiles().get(EtlCommons.PUBMED_REGEX_FILE_ID); + String host = configuration.getDownload().getPubmed().getHost(); + String regexp = configuration.getDownload().getPubmed().getFiles().get(PUBMED_REGEX_FILE_ID); String[] name = regexp.split("[\\[\\]]"); String[] split = name[1].split("\\.\\."); int start = Integer.parseInt(split[0]); @@ -51,13 +54,18 @@ public List download() throws IOException, InterruptedException, C List downloadFiles = new ArrayList<>(); for (int i = start; i <= end; i++) { - String filename = name[0] + String.format("%0" + padding + "d", i) + name[2]; - logger.info("\tDownloading from {} to {} ", url + "/" + filename, pubmedFolder.resolve(filename)); - downloadFiles.add(downloadFile(url + "/" + filename, pubmedFolder.resolve(filename).toString())); + String padString = "%0" + padding + "d"; + String filename = name[0] + String.format(padString, i) + name[2]; + String url = host + filename; + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, pubmedFolder.resolve(filename)); + downloadFiles.add(downloadFile(url, pubmedFolder.resolve(filename).toString())); } - saveDataSource(EtlCommons.PUBMED_NAME, EtlCommons.PUBMED_DATA, configuration.getDownload().getPubmed().getVersion(), getTimeStamp(), - Collections.singletonList(url), pubmedFolder.resolve(EtlCommons.PUBMED_VERSION_FILENAME)); + // Save data source + saveDataSource(EtlCommons.PUBMED_NAME, PUBMED_DATA, configuration.getDownload().getPubmed().getVersion(), getTimeStamp(), + Collections.singletonList(host), pubmedFolder.resolve(PUBMED_VERSION_FILENAME)); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, PUBMED_NAME); return downloadFiles; } From 
e19fe73dd77a1225bd088c368c890bf63071ef67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 26 Apr 2024 11:13:48 +0200 Subject: [PATCH 053/148] lib: create maps to get the names, categories and version filenames from a given data, #TASK-5775, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 43 +++++++++++++++++-- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 3733f6fb59..833767a1dc 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -33,7 +33,9 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; /** * Created by fjlopez on 03/06/16. @@ -369,13 +371,27 @@ public final class EtlCommons { public static final String HGMD_FILE = "hgmd.vcf"; // PubMed - public static final String PUBMED_NAME = "PubMed"; public static final String PUBMED_DATA = "pubmed"; - public static final String PUBMED_SUBDIRECTORY = "pubmed"; - public static final String PUBMED_VERSION_FILENAME = "pubMed" + SUFFIX_VERSION_FILENAME; // Must match the configuration file public static final String PUBMED_REGEX_FILE_ID = "PUBMED_REGEX"; + // Utilities maps + private static Map dataNamesMap = new HashMap<>(); + private static Map dataCategoriesMap = new HashMap<>(); + private static Map dataVersionFilenamesMap = new HashMap<>(); + + static { + + // Populate data names map + dataNamesMap.put(PUBMED_DATA, "PubMed"); + + // Populate data categories map + dataCategoriesMap.put(PUBMED_DATA, "Publication"); + + // Populate data version filenames Map + dataVersionFilenamesMap.put(PUBMED_DATA, "pubMed" + SUFFIX_VERSION_FILENAME); + } + private EtlCommons() { throw new 
IllegalStateException("Utility class"); } @@ -551,4 +567,25 @@ public static void checkDirectory(Path path, String name) throws CellBaseExcepti private static String getMissingFileIdMessage(String fileId) { return "File ID " + fileId + " is missing in the DownloadProperties.URLProperties within the CellBase configuration file"; } + + public static String getDataName(String data) throws CellBaseException { + if (!dataNamesMap.containsKey(data)) { + throw new CellBaseException("Name not found for data " + data); + } + return dataNamesMap.get(data); + } + + public static String getDataCategory(String data) throws CellBaseException { + if (!dataCategoriesMap.containsKey(data)) { + throw new CellBaseException("Category not found for data " + data); + } + return dataCategoriesMap.get(data); + } + + public static String getDataVersionFilename(String data) throws CellBaseException { + if (!dataVersionFilenamesMap.containsKey(data)) { + throw new CellBaseException("Version filename not found for data " + data); + } + return dataVersionFilenamesMap.get(data); + } } From a29afe3ac11fb660ce9fa256b254cbe0df2953c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 26 Apr 2024 11:15:10 +0200 Subject: [PATCH 054/148] lib: update according to the EtlCommons changes, #TASK-5775, #TASK-5564 --- .../admin/executors/LoadCommandExecutor.java | 16 ++++--- .../lib/download/AbstractDownloadManager.java | 17 +++++++- .../lib/download/PubMedDownloadManager.java | 43 +++++++++++-------- 3 files changed, 51 insertions(+), 25 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index ca1a4a9a71..c750beb6aa 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ 
b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -44,6 +44,8 @@ import java.util.List; import java.util.concurrent.ExecutionException; +import static org.opencb.cellbase.lib.EtlCommons.PUBMED_DATA; + /** * Created by imedina on 03/02/15. */ @@ -81,7 +83,7 @@ public LoadCommandExecutor(AdminCliOptionsParser.LoadCommandOptions loadCommandO EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA, EtlCommons.VARIATION_DATA, EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANTS_DATA, EtlCommons.REPEATS_DATA, EtlCommons.ONTOLOGY_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, - EtlCommons.PUBMED_DATA, EtlCommons.PHARMACOGENOMICS_DATA}; + PUBMED_DATA, EtlCommons.PHARMACOGENOMICS_DATA}; } else { loadOptions = loadCommandOptions.data.split(","); } @@ -289,7 +291,7 @@ public void execute() throws CellBaseException { loadSpliceScores(); break; } - case EtlCommons.PUBMED_DATA: { + case PUBMED_DATA: { // Load data, create index and update release loadPubMed(); break; @@ -536,7 +538,7 @@ private void loadSpliceScores(Path spliceFolder) throws IOException, ExecutionEx } private void loadPubMed() throws CellBaseException { - Path pubmedPath = input.resolve(EtlCommons.PUBMED_DATA); + Path pubmedPath = input.resolve(PUBMED_DATA); if (Files.exists(pubmedPath)) { // Load data @@ -544,7 +546,7 @@ private void loadPubMed() throws CellBaseException { if (file.isFile() && (file.getName().endsWith("gz"))) { logger.info("Loading file '{}'", file.getName()); try { - loadRunner.load(file.toPath(), EtlCommons.PUBMED_DATA, dataRelease); + loadRunner.load(file.toPath(), PUBMED_DATA, dataRelease); } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException | IllegalAccessException | ExecutionException | IOException | InterruptedException | LoaderException e) { logger.error("Error loading file '{}': {}", file.getName(), e.toString()); @@ -552,11 +554,11 @@ 
private void loadPubMed() throws CellBaseException { } } // Create index - createIndex(EtlCommons.PUBMED_DATA); + createIndex(PUBMED_DATA); // Update release (collection and sources) - List sources = Collections.singletonList(pubmedPath.resolve(EtlCommons.PUBMED_VERSION_FILENAME)); - dataReleaseManager.update(dataRelease, EtlCommons.PUBMED_DATA, EtlCommons.PUBMED_DATA, sources); + List sources = Collections.singletonList(pubmedPath.resolve(EtlCommons.getDataVersionFilename(PUBMED_DATA))); + dataReleaseManager.update(dataRelease, PUBMED_DATA, PUBMED_DATA, sources); } else { logger.warn("PubMed folder {} not found", pubmedPath); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index 193f2e146d..35fcc5a470 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -47,7 +47,7 @@ import java.time.LocalDateTime; import java.util.*; -import static org.opencb.cellbase.lib.EtlCommons.getFilenameFromUrl; +import static org.opencb.cellbase.lib.EtlCommons.*; public abstract class AbstractDownloadManager { @@ -201,6 +201,21 @@ protected DownloadFile downloadEnsemblDataSource(DownloadProperties.EnsemblPrope return downloadFile(url, outFile.toString()); } + protected void saveDataSource(String data, String version, String date, List urls, Path versionFilePath) + throws IOException, CellBaseException { + String name = getDataName(data); + String category = getDataCategory(data); + DataSource dataSource = new DataSource(name, category, version, date, urls); + + if (StringUtils.isEmpty(version)) { + logger.warn("Version missing for data source {}/{}, using the date as version: {}", category, name, date); + dataSource.setVersion(date); + } + + dataSourceWriter.writeValue(versionFilePath.toFile(), 
dataSource); + } + + @Deprecated protected void saveDataSource(String name, String category, String version, String date, List urls, Path versionFilePath) throws IOException { DataSource dataSource = new DataSource(name, category, version, date, urls); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java index 106e3be709..6451fd76aa 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java @@ -17,8 +17,8 @@ package org.opencb.cellbase.lib.download; import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.lib.EtlCommons; import java.io.IOException; import java.nio.file.Files; @@ -38,35 +38,44 @@ public PubMedDownloadManager(String species, String assembly, Path targetDirecto @Override public List download() throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, PUBMED_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(PUBMED_DATA)); - Path pubmedFolder = downloadFolder.resolve(PUBMED_SUBDIRECTORY); - Files.createDirectories(pubmedFolder); + Path pubmedDownloadFolder = downloadFolder.resolve(PUBMED_DATA); + Files.createDirectories(pubmedDownloadFolder); // Downloads PubMed XML files String host = configuration.getDownload().getPubmed().getHost(); - String regexp = configuration.getDownload().getPubmed().getFiles().get(PUBMED_REGEX_FILE_ID); + List filenames = getPubMedFilenames(configuration.getDownload().getPubmed()); + List downloadFiles = new ArrayList<>(); + for (String filename : filenames) { + String url = host + filename; + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, 
pubmedDownloadFolder.resolve(filename)); + downloadFiles.add(downloadFile(url, pubmedDownloadFolder.resolve(filename).toString())); + } + + // Save data source + saveDataSource(PUBMED_DATA, configuration.getDownload().getPubmed().getVersion(), getTimeStamp(), Collections.singletonList(host), + pubmedDownloadFolder.resolve(getDataVersionFilename(PUBMED_DATA))); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(PUBMED_DATA)); + + return downloadFiles; + } + + public static List getPubMedFilenames(DownloadProperties.URLProperties pubMedProps) { + String regexp = pubMedProps.getFiles().get(PUBMED_REGEX_FILE_ID); String[] name = regexp.split("[\\[\\]]"); String[] split = name[1].split("\\.\\."); int start = Integer.parseInt(split[0]); int end = Integer.parseInt(split[1]); int padding = Integer.parseInt(split[2]); - List downloadFiles = new ArrayList<>(); + List filenames = new ArrayList<>(); for (int i = start; i <= end; i++) { String padString = "%0" + padding + "d"; String filename = name[0] + String.format(padString, i) + name[2]; - String url = host + filename; - logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, pubmedFolder.resolve(filename)); - downloadFiles.add(downloadFile(url, pubmedFolder.resolve(filename).toString())); + filenames.add(filename); } - - // Save data source - saveDataSource(EtlCommons.PUBMED_NAME, PUBMED_DATA, configuration.getDownload().getPubmed().getVersion(), getTimeStamp(), - Collections.singletonList(host), pubmedFolder.resolve(PUBMED_VERSION_FILENAME)); - - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, PUBMED_NAME); - - return downloadFiles; + return filenames; } } From 377ee9c8292cd1c619c7eff039d292be5e32dc53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 26 Apr 2024 11:16:17 +0200 Subject: [PATCH 055/148] lib: improve PubMed builder by adding checks, log messages and fixing sonnar issues, #TAK-5776, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 23 +++--- 
.../cellbase/lib/builders/PubMedBuilder.java | 72 ++++++++++--------- 2 files changed, 48 insertions(+), 47 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 8225648820..620f1973b2 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -391,22 +391,15 @@ private CellBaseBuilder buildSplice() throws IOException { return new SpliceBuilder(spliceInputFolder, serializer); } - private CellBaseBuilder buildPubMed() throws IOException { - Path pubmedInputFolder = downloadFolder.resolve(EtlCommons.PUBMED_DATA); - Path pubmedOutputFolder = buildFolder.resolve(EtlCommons.PUBMED_DATA); - if (!pubmedOutputFolder.toFile().exists()) { - pubmedOutputFolder.toFile().mkdirs(); - } - - logger.info("Copying PubMed version file..."); - if (pubmedInputFolder.resolve(EtlCommons.PUBMED_VERSION_FILENAME).toFile().exists()) { - Files.copy(pubmedInputFolder.resolve(EtlCommons.PUBMED_VERSION_FILENAME), - pubmedOutputFolder.resolve(EtlCommons.PUBMED_VERSION_FILENAME), - StandardCopyOption.REPLACE_EXISTING); - } + private CellBaseBuilder buildPubMed() throws IOException, CellBaseException { + // Sanity check + Path pubMedDownloadPath = downloadFolder.resolve(PUBMED_DATA); + Path pubMedBuildPath = buildFolder.resolve(PUBMED_DATA); + copyVersionFiles(Collections.singletonList(pubMedDownloadPath.resolve(getDataVersionFilename(PUBMED_DATA))), pubMedBuildPath); - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(pubmedOutputFolder); - return new PubMedBuilder(pubmedInputFolder, serializer); + // Create the file serializer and the PubMed builder + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(pubMedBuildPath); + return new 
PubMedBuilder(pubMedDownloadPath, serializer, configuration); } private CellBaseBuilder buildPharmacogenomics() throws CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PubMedBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PubMedBuilder.java index 8aba7c9dda..348d22a07d 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PubMedBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PubMedBuilder.java @@ -16,63 +16,71 @@ package org.opencb.cellbase.lib.builders; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.ObjectWriter; import org.opencb.biodata.formats.pubmed.PubMedParser; import org.opencb.biodata.formats.pubmed.v233jaxb.PubmedArticle; import org.opencb.biodata.formats.pubmed.v233jaxb.PubmedArticleSet; +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; +import org.opencb.cellbase.lib.download.PubMedDownloadManager; import org.opencb.commons.utils.FileUtils; -import org.slf4j.LoggerFactory; -import java.io.File; +import java.nio.file.Files; import java.nio.file.Path; import java.util.List; +import static org.opencb.cellbase.lib.EtlCommons.PUBMED_DATA; +import static org.opencb.cellbase.lib.EtlCommons.getDataName; + public class PubMedBuilder extends CellBaseBuilder { - private Path pubmedDir; - private CellBaseFileSerializer fileSerializer; + private Path pubMedDownloadPath; + private CellBaseConfiguration configuration; - public PubMedBuilder(Path pubmedDir, CellBaseFileSerializer serializer) { + public PubMedBuilder(Path pubMedDownloadPath, CellBaseFileSerializer serializer, CellBaseConfiguration configuration) { super(serializer); - - this.fileSerializer = serializer; - this.pubmedDir = pubmedDir; - - logger = LoggerFactory.getLogger(PubMedBuilder.class); + 
this.pubMedDownloadPath = pubMedDownloadPath; + this.configuration = configuration; } @Override public void parse() throws Exception { - // Check input folder - FileUtils.checkPath(pubmedDir); + logger.info(BUILDING_LOG_MESSAGE, getDataName(PUBMED_DATA)); - logger.info("Parsing PubMed files..."); + // Check input folder + FileUtils.checkPath(pubMedDownloadPath); - for (File file : pubmedDir.toFile().listFiles()) { - if (file.isFile() && (file.getName().endsWith("gz") || file.getName().endsWith("xml"))) { - String name = file.getName().split("\\.")[0]; + // Check PubMed files before parsing them + List pubMedFilenames = PubMedDownloadManager.getPubMedFilenames(configuration.getDownload().getPubmed()); + for (String pubMedFilename : pubMedFilenames) { + Path pubMedPath = pubMedDownloadPath.resolve(pubMedFilename); + if (!Files.exists(pubMedPath)) { + throw new CellBaseException("Expected PubMed file " + pubMedFilename + ", but it was not found at " + pubMedDownloadPath); + } + } + for (String pubMedFilename : pubMedFilenames) { + Path pubMedPath = pubMedDownloadPath.resolve(pubMedFilename); + String basename = pubMedFilename.split("\\.")[0]; - ObjectWriter objectWriter = new ObjectMapper().writerFor(PubmedArticle.class); - PubmedArticleSet pubmedArticleSet = (PubmedArticleSet) PubMedParser.loadXMLInfo(file.getAbsolutePath()); + PubmedArticleSet pubmedArticleSet = (PubmedArticleSet) PubMedParser.loadXMLInfo(pubMedPath.toAbsolutePath().toString()); - List objects = pubmedArticleSet.getPubmedArticleOrPubmedBookArticle(); - logger.info("Parsing PubMed file {} of {} articles ...", file.getName(), objects.size()); - int counter = 0; - for (Object object : objects) { - PubmedArticle pubmedArticle = (PubmedArticle) object; - fileSerializer.serialize(pubmedArticle, name); - if (++counter % 2000 == 0) { - logger.info("\t\t" + counter + " articles"); - } + List objects = pubmedArticleSet.getPubmedArticleOrPubmedBookArticle(); + logger.info(PARSING_LOG_MESSAGE, pubMedPath); + 
int counter = 0; + for (Object object : objects) { + PubmedArticle pubmedArticle = (PubmedArticle) object; + ((CellBaseFileSerializer) serializer).serialize(pubmedArticle, basename); + if (++counter % 2000 == 0) { + logger.info("{} articles", counter); } - fileSerializer.close(); - logger.info("\t\tDone: " + counter + " articles."); } + serializer.close(); + + String logMsg = pubMedPath + " (" + counter + " articles)"; + logger.info(PARSING_DONE_LOG_MESSAGE, logMsg); } - logger.info("Parsing PubMed files finished."); + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(PUBMED_DATA)); } } From 997c8ec62d1e4fa3463a78a10483f9cddcaa668c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 26 Apr 2024 15:38:44 +0200 Subject: [PATCH 056/148] lib: update CADD downloader according to last changes, #TASK-5775, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 15 +++++----- .../lib/download/AbstractDownloadManager.java | 25 +++++++++++++++- .../lib/download/CaddDownloadManager.java | 29 ++++++++++--------- 3 files changed, 48 insertions(+), 21 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 833767a1dc..da209c66fc 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -265,10 +265,8 @@ public final class EtlCommons { // Variation functional score public static final String VARIATION_FUNCTIONAL_SCORE_DATA = "variation_functional_score"; - public static final String VARIATION_FUNCTIONAL_SCORE_SUBDIRECTORY = "variation_functional_score"; // CADD scores - public static final String CADD_NAME = "CADD"; - public static final String CADD_VERSION_FILENAME = "cadd" + SUFFIX_VERSION_FILENAME; + public static final String CADD_DATA = "cadd"; // Must match the configuration file public static final String 
CADD_FILE_ID = "CADD"; @@ -299,7 +297,6 @@ public final class EtlCommons { public static final String GENOME_INFO_DATA = "genome_info"; public static final String DISGENET_DATA = "disgenet"; public static final String HPO_DATA = "hpo"; - public static final String CADD_DATA = "cadd"; public static final String PPI_DATA = "ppi"; public static final String DRUG_DATA = "drug"; @@ -384,12 +381,16 @@ public final class EtlCommons { // Populate data names map dataNamesMap.put(PUBMED_DATA, "PubMed"); + dataNamesMap.put(VARIATION_FUNCTIONAL_SCORE_DATA, "Variant Functional Scores"); + dataNamesMap.put(CADD_DATA, "CADD"); // Populate data categories map dataCategoriesMap.put(PUBMED_DATA, "Publication"); + dataCategoriesMap.put(CADD_DATA, dataNamesMap.get(VARIATION_FUNCTIONAL_SCORE_DATA)); // Populate data version filenames Map dataVersionFilenamesMap.put(PUBMED_DATA, "pubMed" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(CADD_DATA, "cadd" + SUFFIX_VERSION_FILENAME); } private EtlCommons() { @@ -570,21 +571,21 @@ private static String getMissingFileIdMessage(String fileId) { public static String getDataName(String data) throws CellBaseException { if (!dataNamesMap.containsKey(data)) { - throw new CellBaseException("Name not found for data " + data); + throw new CellBaseException("Name not found for data '" + data + "'"); } return dataNamesMap.get(data); } public static String getDataCategory(String data) throws CellBaseException { if (!dataCategoriesMap.containsKey(data)) { - throw new CellBaseException("Category not found for data " + data); + throw new CellBaseException("Category not found for data '" + data + "'"); } return dataCategoriesMap.get(data); } public static String getDataVersionFilename(String data) throws CellBaseException { if (!dataVersionFilenamesMap.containsKey(data)) { - throw new CellBaseException("Version filename not found for data " + data); + throw new CellBaseException("Version filename not found for data '" + data + "'"); } return 
dataVersionFilenamesMap.get(data); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index 35fcc5a470..a05760f686 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -52,8 +52,10 @@ public abstract class AbstractDownloadManager { protected static final String DOWNLOADING_LOG_MESSAGE = "Downloading {} ..."; - protected static final String DOWNLOADING_FROM_TO_LOG_MESSAGE = "Downloading {} to {} ..."; protected static final String DOWNLOADING_DONE_LOG_MESSAGE = "Downloading {} done!"; + protected static final String CATEGORY_DOWNLOADING_LOG_MESSAGE = "Downloading {}/{} ..."; + protected static final String CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE = "Downloading {}/{} done!"; + protected static final String DOWNLOADING_FROM_TO_LOG_MESSAGE = "Downloading {} to {} ..."; protected String species; protected String assembly; @@ -141,12 +143,33 @@ protected boolean speciesHasInfoToDownload(SpeciesConfiguration sp, String info) return hasInfo; } + protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLProperties props, String fileId, String data, Path outPath) + throws IOException, InterruptedException, CellBaseException { + return downloadAndSaveDataSource(props, fileId, data, null, outPath); + } + + protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLProperties props, String fileId, String data, String chromosome, + Path outPath) throws IOException, InterruptedException, CellBaseException { + String versionFilename = getDataVersionFilename(data); + + // Download file + DownloadFile downloadFile = downloadDataSource(props, fileId, chromosome, outPath); + + // Save data source + saveDataSource(data, props.getVersion(), getTimeStamp(), 
Collections.singletonList(downloadFile.getUrl()), + outPath.resolve(versionFilename)); + + return downloadFile; + } + + @Deprecated protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLProperties props, String fileId, String name, String category, String versionFilename, Path outPath) throws IOException, InterruptedException, CellBaseException { return downloadAndSaveDataSource(props, fileId, name, category, null, versionFilename, outPath); } + @Deprecated protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLProperties props, String fileId, String name, String category, String chromosome, String versionFilename, Path outPath) throws IOException, InterruptedException, CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java index 738c66f3f1..0b0d09f412 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java @@ -36,22 +36,25 @@ public CaddDownloadManager(String species, String assembly, Path targetDirectory @Override public List download() throws IOException, InterruptedException, CellBaseException { - if (!speciesHasInfoToDownload(speciesConfiguration, VARIATION_FUNCTIONAL_SCORE_DATA)) { - return null; + logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataCategory(CADD_DATA), getDataName(CADD_DATA)); + + if (!speciesHasInfoToDownload(speciesConfiguration, VARIATION_FUNCTIONAL_SCORE_DATA) + || !speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + logger.info("{}/{} not supported for species {}", getDataCategory(CADD_DATA), getDataName(CADD_DATA), + speciesConfiguration.getScientificName()); + return Collections.emptyList(); } - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - Path 
variationFunctionalScoreFolder = downloadFolder.resolve(VARIATION_FUNCTIONAL_SCORE_SUBDIRECTORY); - Files.createDirectories(variationFunctionalScoreFolder); - logger.info("Downloading {} files at {} ...", CADD_NAME, variationFunctionalScoreFolder); + // Create the CADD download path + Path caddDownloadPath = downloadFolder.resolve(VARIATION_FUNCTIONAL_SCORE_DATA).resolve(CADD_DATA); + Files.createDirectories(caddDownloadPath); - // Download CADD and save data source - DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getCadd(), CADD_FILE_ID, CADD_NAME, - VARIATION_FUNCTIONAL_SCORE_DATA, CADD_VERSION_FILENAME, variationFunctionalScoreFolder); + // Download CADD and save data source + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getCadd(), CADD_FILE_ID, CADD_DATA, + caddDownloadPath); - return Collections.singletonList(downloadFile); - } - logger.warn("CADD scores are not supported for {}", speciesConfiguration.getScientificName()); - return Collections.emptyList(); + logger.info(CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE, getDataCategory(CADD_DATA), getDataName(CADD_DATA)); + + return Collections.singletonList(downloadFile); } } From 96078b7e22d63ee03c4c458ef22c8ef90f23c43a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 26 Apr 2024 16:13:57 +0200 Subject: [PATCH 057/148] lib: improve the CADD builder by adding checks, log messages, cleaning code and fixing Sonar issues, #TASK-5776, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 17 +- .../org/opencb/cellbase/lib/EtlCommons.java | 2 + .../lib/builders/CaddScoreBuilder.java | 233 ++++++++---------- .../lib/builders/CellBaseBuilder.java | 7 +- 4 files changed, 126 insertions(+), 133 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java
b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 620f1973b2..355e218600 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -264,12 +264,15 @@ private CellBaseBuilder buildRefSeq() { return new RefSeqGeneBuilder(refseqFolderPath, speciesConfiguration, serializer); } - private CellBaseBuilder buildCadd() { - Path variationFunctionalScorePath = downloadFolder.resolve("variation_functional_score"); - copyVersionFiles(Arrays.asList(variationFunctionalScorePath.resolve("caddVersion.json"))); - Path caddFilePath = variationFunctionalScorePath.resolve("whole_genome_SNVs.tsv.gz"); - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "cadd"); - return new CaddScoreBuilder(caddFilePath, serializer); + private CellBaseBuilder buildCadd() throws CellBaseException { + // Sanity check + Path caddDownloadPath = downloadFolder.resolve(VARIATION_FUNCTIONAL_SCORE_DATA).resolve(CADD_DATA); + Path caddBuildPath = buildFolder.resolve(VARIATION_FUNCTIONAL_SCORE_DATA).resolve(CADD_DATA); + copyVersionFiles(Collections.singletonList(caddDownloadPath.resolve(getDataVersionFilename(CADD_DATA))), caddBuildPath); + + // Create the file serializer and the protein builder + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(caddBuildPath, CADD_DATA); + return new CaddScoreBuilder(caddDownloadPath, serializer); } private CellBaseBuilder buildRevel() { @@ -391,7 +394,7 @@ private CellBaseBuilder buildSplice() throws IOException { return new SpliceBuilder(spliceInputFolder, serializer); } - private CellBaseBuilder buildPubMed() throws IOException, CellBaseException { + private CellBaseBuilder buildPubMed() throws CellBaseException { // Sanity check Path pubMedDownloadPath = downloadFolder.resolve(PUBMED_DATA); Path pubMedBuildPath = 
buildFolder.resolve(PUBMED_DATA); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index da209c66fc..7f0e97d900 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -267,6 +267,8 @@ public final class EtlCommons { public static final String VARIATION_FUNCTIONAL_SCORE_DATA = "variation_functional_score"; // CADD scores public static final String CADD_DATA = "cadd"; + public static final String CADD_RAW_DATA = "cadd_raw"; + public static final String CADD_SCALED_DATA = "cadd_scaled"; // Must match the configuration file public static final String CADD_FILE_ID = "CADD"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java index b593f44901..75b35e8a73 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java @@ -17,32 +17,33 @@ package org.opencb.cellbase.lib.builders; import org.opencb.biodata.models.core.GenomicScoreRegion; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.opencb.commons.utils.FileUtils; -import org.slf4j.LoggerFactory; import java.io.BufferedReader; +import java.io.File; import java.nio.file.Path; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import static org.opencb.cellbase.lib.EtlCommons.*; + /** * Created by imedina on 06/11/15. 
*/ public class CaddScoreBuilder extends CellBaseBuilder { - private Path caddFilePath; + private Path caddDownloadPath; private static final int CHUNK_SIZE = 1000; private static final int DECIMAL_RESOLUTION = 100; - public CaddScoreBuilder(Path caddFilePath, CellBaseSerializer serializer) { + public CaddScoreBuilder(Path caddDownloadPath, CellBaseSerializer serializer) { super(serializer); - this.caddFilePath = caddFilePath; - - logger = LoggerFactory.getLogger(ConservationBuilder.class); + this.caddDownloadPath = caddDownloadPath; } /* Example: @@ -57,14 +58,25 @@ public CaddScoreBuilder(Path caddFilePath, CellBaseSerializer serializer) { */ @Override public void parse() throws Exception { - FileUtils.checkPath(caddFilePath); + String dataName = getDataName(CADD_DATA); + String dataCategory = getDataCategory(CADD_DATA); + + logger.info(CATEGORY_BUILDING_LOG_MESSAGE, dataCategory, dataName); + + // Sanity check + checkDirectory(caddDownloadPath, dataName); + + // Check ontology files + List caddFiles = checkFiles(dataSourceReader.readValue(caddDownloadPath.resolve(getDataVersionFilename(CADD_DATA)).toFile()), + caddDownloadPath, dataName); + if (caddFiles.size() != 1) { + throw new CellBaseException("One " + dataName + " file is expected, but currently there are " + caddFiles.size() + " files"); + } - BufferedReader bufferedReader = FileUtils.newBufferedReader(caddFilePath); List rawValues = new ArrayList<>(CHUNK_SIZE); List scaledValues = new ArrayList<>(CHUNK_SIZE); int start = 1; -// int end = 1999; int end = CHUNK_SIZE - 1; String line; String[] fields = new String[0]; @@ -72,8 +84,8 @@ public void parse() throws Exception { int lineCount = 0; int counter = 1; int serializedChunks = 0; - int previousPosition = 0; - int newPosition = 0; + int prevPos = 0; + int newPos = 0; String chromosome = null; String[] nucleotides = new String[]{"A", "C", "G", "T"}; @@ -81,127 +93,100 @@ public void parse() throws Exception { long scaledLongValue = 0; Map 
rawScoreValuesMap = new HashMap<>(); Map scaledScoreValuesMap = new HashMap<>(); - while ((line = bufferedReader.readLine()) != null) { - if (!line.startsWith("#")) { - fields = line.split("\t"); - newPosition = Integer.parseInt(fields[1]); -// if (fields[0].equals("1") && fields[1].equals("249240621")) { -// if (fields[0].equals("1") && fields[1].equals("69100")) { -// if (fields[0].equals("1") && fields[1].equals("144854598")) { -// logger.debug("line {} reached", line); -// logger.debug("Associated chunk count {}", serializedChunks); -// logger.debug("start {}", start); -// logger.debug("end {}", end); -// logger.debug("chunk size {}", CHUNK_SIZE); -// } - // this only happens the first time, when we start reading the file - if (chromosome == null) { - logger.info("Parsing chr {} ", fields[0]); - chromosome = fields[0]; - - start = newPosition; - previousPosition = newPosition; - end = start + CHUNK_SIZE - 2; - } - if (!chromosome.equals(fields[0])) { - logger.info("Parsing chr {} ", fields[0]); - // both raw and scaled are serialized - GenomicScoreRegion genomicScoreRegion = - new GenomicScoreRegion<>(chromosome, start, previousPosition, "cadd_raw", rawValues); - serializer.serialize(genomicScoreRegion); - - genomicScoreRegion = new GenomicScoreRegion<>(chromosome, start, previousPosition, "cadd_scaled", scaledValues); - serializer.serialize(genomicScoreRegion); - - serializedChunks++; - chromosome = fields[0]; - start = newPosition; -// end = CHUNK_SIZE - 1; - end = start + CHUNK_SIZE - 2; - - counter = 0; - rawValues.clear(); - scaledValues.clear(); -// rawLongValue = 0; -// lineCount = 0; -// rawScoreValuesMap.clear(); -// scaledScoreValuesMap.clear(); - // The series of cadd scores is not continuous through the whole chromosome - } else if (end < newPosition || (newPosition - previousPosition) > 1) { - // both raw and scaled are serialized - GenomicScoreRegion genomicScoreRegion - = new GenomicScoreRegion<>(fields[0], start, previousPosition, "cadd_raw", 
rawValues); - serializer.serialize(genomicScoreRegion); - - genomicScoreRegion - = new GenomicScoreRegion<>(fields[0], start, previousPosition, "cadd_scaled", scaledValues); - serializer.serialize(genomicScoreRegion); - - serializedChunks++; - start = newPosition; -// start = end + 1; -// end += CHUNK_SIZE; - end = (start / CHUNK_SIZE) * CHUNK_SIZE + CHUNK_SIZE - 1; - - counter = 0; - rawValues.clear(); - scaledValues.clear(); - } + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(caddFiles.get(0).toPath())) { + while ((line = bufferedReader.readLine()) != null) { + if (!line.startsWith("#")) { + fields = line.split("\t"); + newPos = Integer.parseInt(fields[1]); + String message = "chrom. " + fields[0]; + // This only happens the first time, when we start reading the file + if (chromosome == null) { + logger.info(PARSING_LOG_MESSAGE, message); + chromosome = fields[0]; + + start = newPos; + prevPos = newPos; + end = start + CHUNK_SIZE - 2; + } - rawScoreValuesMap.put(fields[3], Float.valueOf(fields[4])); - scaledScoreValuesMap.put(fields[3], Float.valueOf(fields[5])); - - if (++lineCount == 3) { -// if (fields[0].equals("1") && fields[1].equals("249240621")) { -// if (fields[0].equals("1") && fields[1].equals("69100")) { -// if (fields[0].equals("1") && fields[1].equals("144854598")) { -// logger.info("offset: {}", rawValues.size()); -// } - - for (String nucleotide : nucleotides) { - // raw CADD score values can be negative, we add 10 to make positive - float a = rawScoreValuesMap.getOrDefault(nucleotide, 10f) + 10.0f; - v = (short) (a * DECIMAL_RESOLUTION); - rawLongValue = (rawLongValue << 16) | v; - - // scaled CADD scores are always positive - a = scaledScoreValuesMap.getOrDefault(nucleotide, 0f); - v = (short) (a * DECIMAL_RESOLUTION); - scaledLongValue = (scaledLongValue << 16) | v; + if (!chromosome.equals(fields[0])) { + logger.info(PARSING_LOG_MESSAGE, message); + + // Both raw and scaled are serialized + GenomicScoreRegion 
genomicScoreRegion = new GenomicScoreRegion<>(chromosome, start, prevPos, CADD_RAW_DATA, + rawValues); + serializer.serialize(genomicScoreRegion); + + genomicScoreRegion = new GenomicScoreRegion<>(chromosome, start, prevPos, CADD_SCALED_DATA, scaledValues); + serializer.serialize(genomicScoreRegion); + + serializedChunks++; + chromosome = fields[0]; + start = newPos; + end = start + CHUNK_SIZE - 2; + + counter = 0; + rawValues.clear(); + scaledValues.clear(); + // The series of cadd scores is not continuous through the whole chromosome + } else if (end < newPos || (newPos - prevPos) > 1) { + // Both raw and scaled are serialized + GenomicScoreRegion genomicScoreRegion = new GenomicScoreRegion<>(fields[0], start, prevPos, CADD_RAW_DATA, + rawValues); + serializer.serialize(genomicScoreRegion); + + genomicScoreRegion = new GenomicScoreRegion<>(fields[0], start, prevPos, CADD_SCALED_DATA, scaledValues); + serializer.serialize(genomicScoreRegion); + + serializedChunks++; + start = newPos; + end = (start / CHUNK_SIZE) * CHUNK_SIZE + CHUNK_SIZE - 1; + + counter = 0; + rawValues.clear(); + scaledValues.clear(); } -// if (rawLongValue < 0 || scaledLongValue < 0) { -// logger.error("raw/scaled Long Values cannot be 0"); -// logger.error("Last read line {}", line); -// System.exit(1); -// } - rawValues.add(rawLongValue); - scaledValues.add(scaledLongValue); - - counter++; - rawLongValue = 0; - lineCount = 0; - rawScoreValuesMap.clear(); - scaledScoreValuesMap.clear(); + rawScoreValuesMap.put(fields[3], Float.valueOf(fields[4])); + scaledScoreValuesMap.put(fields[3], Float.valueOf(fields[5])); + + if (++lineCount == 3) { + for (String nucleotide : nucleotides) { + // Raw CADD score values can be negative, we add 10 to make positive + float a = rawScoreValuesMap.getOrDefault(nucleotide, 10f) + 10.0f; + v = (short) (a * DECIMAL_RESOLUTION); + rawLongValue = (rawLongValue << 16) | v; + + // Scaled CADD scores are always positive + a = 
scaledScoreValuesMap.getOrDefault(nucleotide, 0f); + v = (short) (a * DECIMAL_RESOLUTION); + scaledLongValue = (scaledLongValue << 16) | v; + } + + rawValues.add(rawLongValue); + scaledValues.add(scaledLongValue); + + counter++; + rawLongValue = 0; + lineCount = 0; + rawScoreValuesMap.clear(); + scaledScoreValuesMap.clear(); + } + prevPos = newPos; } - previousPosition = newPosition; } - } - // Last chunks can be incomplete for both raw and scaled are serialized -// GenomicScoreRegion genomicScoreRegion = -// new GenomicScoreRegion<>(fields[0], start, start + rawValues.size() - 1, "cadd_raw", rawValues); - GenomicScoreRegion genomicScoreRegion = - new GenomicScoreRegion<>(fields[0], start, newPosition, "cadd_raw", rawValues); - serializer.serialize(genomicScoreRegion); + // Last chunks can be incomplete for both raw and scaled are serialized + GenomicScoreRegion genomicScoreRegion = new GenomicScoreRegion<>(fields[0], start, newPos, CADD_RAW_DATA, rawValues); + serializer.serialize(genomicScoreRegion); + + genomicScoreRegion = new GenomicScoreRegion<>(fields[0], start, newPos, CADD_SCALED_DATA, scaledValues); + serializer.serialize(genomicScoreRegion); -// genomicScoreRegion = new GenomicScoreRegion<>(fields[0], start, start + scaledValues.size() - 1, "cadd_scaled", scaledValues); - genomicScoreRegion = new GenomicScoreRegion<>(fields[0], start, newPosition, "cadd_scaled", scaledValues); - serializer.serialize(genomicScoreRegion); + serializer.close(); + } - serializer.close(); - bufferedReader.close(); - logger.info("Parsing finished."); + logger.info(CATEGORY_BUILDING_DONE_LOG_MESSAGE, dataCategory, dataName); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java index 49d847c033..f5e79320d7 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java @@ -44,8 +44,11 @@ public abstract class CellBaseBuilder { public static final String BUILDING_LOG_MESSAGE = "Building {} ..."; public static final String BUILDING_DONE_LOG_MESSAGE = "Building {} done."; - public static final String PARSING_LOG_MESSAGE = "Parsing file {} ..."; - public static final String PARSING_DONE_LOG_MESSAGE = "Parsing file {} done."; + public static final String CATEGORY_BUILDING_LOG_MESSAGE = "Building {}/{} ..."; + public static final String CATEGORY_BUILDING_DONE_LOG_MESSAGE = "Building {}/{} done."; + + public static final String PARSING_LOG_MESSAGE = "Parsing {} ..."; + public static final String PARSING_DONE_LOG_MESSAGE = "Parsing {} done."; public CellBaseBuilder(CellBaseSerializer serializer) { From 3163a90cfbfda44ec205279cbf21f2b947cc10a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 26 Apr 2024 16:47:16 +0200 Subject: [PATCH 058/148] lib: update the REVEL downloader according to the last changes, and add log messages, #TASK-5775, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 8 +++-- .../MissenseScoresDownloadManager.java | 33 ++++++++++++++----- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 7f0e97d900..28a349e028 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -177,11 +177,9 @@ public final class EtlCommons { public static final String PHARMGKB_RELATIONSHIPS_FILE_ID = "RELATIONSHIPS"; // Missense variantion functional score - public static final String MISSENSE_VARIATION_SCORE_NAME = "Missense Variation Functional Scores"; public static final String MISSENSE_VARIATION_SCORE_DATA = "missense_variation_functional_score"; // 
Revel - public static final String REVEL_NAME = "Revel"; - public static final String REVEL_VERSION_FILENAME = "revel" + SUFFIX_VERSION_FILENAME; + public static final String REVEL_DATA = "revel"; // Must match the configuration file public static final String REVEL_FILE_ID = "REVEL"; @@ -385,14 +383,18 @@ public final class EtlCommons { dataNamesMap.put(PUBMED_DATA, "PubMed"); dataNamesMap.put(VARIATION_FUNCTIONAL_SCORE_DATA, "Variant Functional Scores"); dataNamesMap.put(CADD_DATA, "CADD"); + dataNamesMap.put(MISSENSE_VARIATION_SCORE_DATA, "Missense Variation Scores"); + dataNamesMap.put(REVEL_DATA, "Revel"); // Populate data categories map dataCategoriesMap.put(PUBMED_DATA, "Publication"); dataCategoriesMap.put(CADD_DATA, dataNamesMap.get(VARIATION_FUNCTIONAL_SCORE_DATA)); + dataCategoriesMap.put(REVEL_DATA, dataNamesMap.get(MISSENSE_VARIATION_SCORE_DATA)); // Populate data version filenames Map dataVersionFilenamesMap.put(PUBMED_DATA, "pubMed" + SUFFIX_VERSION_FILENAME); dataVersionFilenamesMap.put(CADD_DATA, "cadd" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(REVEL_DATA, "revel" + SUFFIX_VERSION_FILENAME); } private EtlCommons() { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java index ca491a97fe..b2c102a10e 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java @@ -37,18 +37,33 @@ public MissenseScoresDownloadManager(String species, String assembly, Path targe @Override public List download() throws IOException, InterruptedException, CellBaseException { - return Collections.singletonList(downloadRevel()); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(MISSENSE_VARIATION_SCORE_DATA)); + + DownloadFile downloadFile = downloadRevel(); + + 
logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(MISSENSE_VARIATION_SCORE_DATA)); + + return Collections.singletonList(downloadFile); } public DownloadFile downloadRevel() throws IOException, InterruptedException, CellBaseException { - if (speciesConfiguration.getScientificName().equals(EtlCommons.HOMO_SAPIENS_NAME)) { - Path missensePredictionScorePath = downloadFolder.resolve(EtlCommons.MISSENSE_VARIATION_SCORE_DATA); - Files.createDirectories(missensePredictionScorePath); - - logger.info("Downloading {}/{} ...", MISSENSE_VARIATION_SCORE_NAME, REVEL_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getRevel(), REVEL_FILE_ID, REVEL_NAME, - MISSENSE_VARIATION_SCORE_DATA, REVEL_VERSION_FILENAME, missensePredictionScorePath); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(REVEL_DATA)); + if (!speciesConfiguration.getScientificName().equals(EtlCommons.HOMO_SAPIENS_NAME)) { + logger.info("{}/{} not supported for species {}", getDataCategory(REVEL_DATA), getDataName(REVEL_DATA), + speciesConfiguration.getScientificName()); + return null; } - return null; + + // Create the REVEL download path + Path revelDownloadPath = downloadFolder.resolve(MISSENSE_VARIATION_SCORE_DATA).resolve(REVEL_DATA); + Files.createDirectories(revelDownloadPath); + + // Download REVEL and save data source + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getRevel(), REVEL_FILE_ID, REVEL_DATA, + revelDownloadPath); + + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(REVEL_DATA)); + + return downloadFile; } } From bc22fadd56a701b8ed5012dfa0485603323892a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 29 Apr 2024 08:14:40 +0200 Subject: [PATCH 059/148] lib: add log messages, #TASK-5776, #TASK-5564 --- .../java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java index 75b35e8a73..d0597c4c2a 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java @@ -94,6 +94,7 @@ public void parse() throws Exception { Map rawScoreValuesMap = new HashMap<>(); Map scaledScoreValuesMap = new HashMap<>(); + logger.info(PARSING_LOG_MESSAGE, caddFiles.get(0)); try (BufferedReader bufferedReader = FileUtils.newBufferedReader(caddFiles.get(0).toPath())) { while ((line = bufferedReader.readLine()) != null) { if (!line.startsWith("#")) { @@ -186,6 +187,7 @@ public void parse() throws Exception { serializer.close(); } + logger.info(PARSING_DONE_LOG_MESSAGE, caddFiles.get(0)); logger.info(CATEGORY_BUILDING_DONE_LOG_MESSAGE, dataCategory, dataName); } From 0c9a29958198d13e925b9219a9b438171a814d59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 29 Apr 2024 08:16:44 +0200 Subject: [PATCH 060/148] lib: improve the Revel builder by fixing sonnar issues and adding checks and log messages, #TASK-5776, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 14 +- .../lib/builders/RevelScoreBuilder.java | 126 ++++++++++-------- 2 files changed, 82 insertions(+), 58 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 355e218600..f1fdcbbb19 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -275,11 +275,15 @@ private CellBaseBuilder buildCadd() throws CellBaseException { return new 
CaddScoreBuilder(caddDownloadPath, serializer); } - private CellBaseBuilder buildRevel() { - Path missensePredictionScorePath = downloadFolder.resolve(EtlCommons.MISSENSE_VARIATION_SCORE_DATA); - copyVersionFiles(Arrays.asList(missensePredictionScorePath.resolve("revelVersion.json"))); - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, EtlCommons.MISSENSE_VARIATION_SCORE_DATA); - return new RevelScoreBuilder(missensePredictionScorePath, serializer); + private CellBaseBuilder buildRevel() throws CellBaseException { + // Sanity check + Path revelDownloadPath = downloadFolder.resolve(MISSENSE_VARIATION_SCORE_DATA).resolve(REVEL_DATA); + Path revelBuildPath = buildFolder.resolve(MISSENSE_VARIATION_SCORE_DATA).resolve(REVEL_DATA); + copyVersionFiles(Collections.singletonList(revelDownloadPath.resolve(getDataVersionFilename(REVEL_DATA))), revelBuildPath); + + // Create the file serializer and the regulatory feature builder + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(revelBuildPath, REVEL_DATA); + return new RevelScoreBuilder(revelDownloadPath, serializer); } private CellBaseBuilder buildRegulation() throws CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java index 2ccf0cb2a1..68c6128f25 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java @@ -19,8 +19,8 @@ import org.opencb.biodata.models.core.MissenseVariantFunctionalScore; import org.opencb.biodata.models.core.TranscriptMissenseVariantFunctionalScore; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.slf4j.LoggerFactory; import java.io.*; import java.nio.file.Path; @@ -30,75 +30,95 @@ import 
java.util.zip.ZipFile; import java.util.zip.ZipInputStream; +import static org.opencb.cellbase.lib.EtlCommons.*; + public class RevelScoreBuilder extends CellBaseBuilder { - private Path revelFilePath = null; - private static final String SOURCE = "revel"; + private Path revelDownloadPath = null; - public RevelScoreBuilder(Path revelDirectoryPath, CellBaseSerializer serializer) { + public RevelScoreBuilder(Path revelDownloadPath, CellBaseSerializer serializer) { super(serializer); - this.revelFilePath = revelDirectoryPath.resolve("revel-v1.3_all_chromosomes.zip"); - logger = LoggerFactory.getLogger(ConservationBuilder.class); - + this.revelDownloadPath = revelDownloadPath; } @Override - public void parse() throws IOException { - logger.error("processing Revel file at " + revelFilePath.toAbsolutePath()); - ZipInputStream zis = new ZipInputStream(new FileInputStream(String.valueOf(revelFilePath))); + public void parse() throws IOException, CellBaseException { + String dataName = getDataName(REVEL_DATA); + String dataCategory = getDataCategory(REVEL_DATA); + + logger.info(CATEGORY_BUILDING_LOG_MESSAGE, dataCategory, dataName); + + // Sanity check + checkDirectory(revelDownloadPath, dataName); + + // Check ontology files + List revelFiles = checkFiles(dataSourceReader.readValue(revelDownloadPath.resolve(getDataVersionFilename(REVEL_DATA)) + .toFile()), revelDownloadPath, dataName); + if (revelFiles.size() != 1) { + throw new CellBaseException("One " + dataName + " file is expected, but currently there are " + revelFiles.size() + " files"); + } + + ZipInputStream zis = new ZipInputStream(new FileInputStream(String.valueOf(revelDownloadPath))); ZipEntry zipEntry = zis.getNextEntry(); - ZipFile zipFile = new ZipFile(String.valueOf(revelFilePath)); - InputStream inputStream = zipFile.getInputStream(zipEntry); - BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream)); - - // skip header - String line = bufferedReader.readLine(); - String[] 
fields = null; - String lastEntry = null; - String currentEntry = null; - List scores = new ArrayList<>(); - MissenseVariantFunctionalScore predictions = null; - while ((line = bufferedReader.readLine()) != null) { - fields = line.split(","); - String chromosome = fields[0]; - if (".".equalsIgnoreCase(fields[2])) { - // 1,12855835,.,C,A,A,D,0.175 - // skip if invalid position - continue; - } - int position = Integer.parseInt(fields[2]); - String reference = fields[3]; - String alternate = fields[4]; - String aaReference = fields[5]; - String aaAlternate = fields[6]; - double score = Double.parseDouble(fields[7]); - - currentEntry = chromosome + position; - - // new chromosome + position, store previous entry - if (lastEntry != null && !currentEntry.equals(lastEntry)) { - serializer.serialize(predictions); - scores = new ArrayList<>(); - predictions = null; - } + logger.info(PARSING_LOG_MESSAGE, revelFiles.get(0)); - if (predictions == null) { - predictions = new MissenseVariantFunctionalScore(chromosome, position, reference, SOURCE, scores); + ZipFile zipFile = new ZipFile(revelFiles.get(0).toString()); + InputStream inputStream = zipFile.getInputStream(zipEntry); + try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream))) { + // Skip header + bufferedReader.readLine(); + String[] fields; + String lastEntry = null; + String currentEntry; + List scores = new ArrayList<>(); + MissenseVariantFunctionalScore predictions = null; + String line; + while ((line = bufferedReader.readLine()) != null) { + fields = line.split(","); + String chromosome = fields[0]; + if (".".equalsIgnoreCase(fields[2])) { + // 1,12855835,.,C,A,A,D,0.175 + // skip if invalid position + continue; + } + int position = Integer.parseInt(fields[2]); + String reference = fields[3]; + String alternate = fields[4]; + String aaReference = fields[5]; + String aaAlternate = fields[6]; + double score = Double.parseDouble(fields[7]); + + currentEntry = chromosome + 
position; + + // new chromosome + position, store previous entry + if (lastEntry != null && !currentEntry.equals(lastEntry)) { + serializer.serialize(predictions); + scores = new ArrayList<>(); + predictions = null; + } + + if (predictions == null) { + predictions = new MissenseVariantFunctionalScore(chromosome, position, reference, REVEL_DATA, scores); + } + + TranscriptMissenseVariantFunctionalScore predictedScore = new TranscriptMissenseVariantFunctionalScore("", alternate, + aaReference, aaAlternate, score); + scores.add(predictedScore); + lastEntry = chromosome + position; } - TranscriptMissenseVariantFunctionalScore predictedScore = new TranscriptMissenseVariantFunctionalScore("", - alternate, aaReference, aaAlternate, score); - scores.add(predictedScore); - lastEntry = chromosome + position; + // Serialise last entry + serializer.serialize(predictions); } - // serialise last entry - serializer.serialize(predictions); + logger.info(PARSING_DONE_LOG_MESSAGE, revelFiles.get(0)); + // Close zis.close(); zipFile.close(); inputStream.close(); - bufferedReader.close(); + + logger.info(CATEGORY_BUILDING_DONE_LOG_MESSAGE, dataCategory, dataName); } } From 4f9e39a057b1d5ea42f8f6b36984731f4378857b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 29 Apr 2024 11:51:43 +0200 Subject: [PATCH 061/148] lib: update CellBase downloaders according to the last changes, #TASK-5775, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 273 +++++++++++------- .../lib/download/AbstractDownloadManager.java | 19 ++ .../lib/download/GeneDownloadManager.java | 210 ++++++++------ .../lib/download/GenomeDownloadManager.java | 66 ++--- .../lib/download/OntologyDownloadManager.java | 24 +- .../lib/download/PharmGKBDownloadManager.java | 13 +- .../lib/download/ProteinDownloadManager.java | 18 +- .../download/RegulationDownloadManager.java | 39 +-- 8 files changed, 383 insertions(+), 279 deletions(-) diff --git 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 28a349e028..f2cc152005 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -21,6 +21,7 @@ import org.apache.logging.log4j.core.config.Configurator; import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.lib.download.DownloadFile; import org.opencb.commons.utils.FileUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -36,6 +37,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.stream.Collectors; /** * Created by fjlopez on 03/06/16. @@ -43,7 +45,7 @@ public final class EtlCommons { // Ensembl - public static final String ENSEMBL_NAME = "Ensembl"; + public static final String ENSEMBL_DATA = "ensembl"; public static final String PUT_RELEASE_HERE_MARK = "put_release_here"; public static final String PUT_SPECIES_HERE_MARK = "put_species_here"; public static final String PUT_CAPITAL_SPECIES_HERE_MARK = "put_capital_species_here"; @@ -67,88 +69,65 @@ public final class EtlCommons { public static final String SUFFIX_VERSION_FILENAME = "Version.json"; - // Genome (Ensembl) - public static final String GENOME_NAME = "Genome"; + // Genome public static final String GENOME_DATA = "genome"; - public static final String GENOME_SUBDIRECTORY = GENOME_DATA; - public static final String GENOME_VERSION_FILENAME = GENOME_DATA + SUFFIX_VERSION_FILENAME; - // Gene (Ensembl) + // Gene public static final String GENE_DATA = "gene"; - public static final String ENSEMBL_CORE_VERSION_FILENAME = "ensemblCore" + SUFFIX_VERSION_FILENAME; + public static final String GENE_ANNOTATION_DATA = "gene_annotation"; + public static final String GENE_DISEASE_ANNOTATION_DATA = "gene_disease_annotation"; // RefSeq 
- public static final String REFSEQ_NAME = "RefSeq"; public static final String REFSEQ_DATA = "refseq"; - public static final String REFSEQ_VERSION_FILENAME = "refSeq" + SUFFIX_VERSION_FILENAME; // Must match the configuration file public static final String REFSEQ_GENOMIC_GTF_FILE_ID = "GENOMIC_GTF"; public static final String REFSEQ_GENOMIC_FNA_FILE_ID = "GENOMIC_FNA"; public static final String REFSEQ_PROTEIN_FAA_FILE_ID = "PROTEIN_FAA"; public static final String REFSEQ_RNA_FNA_FILE_ID = "RNA_FNA"; - // MANE Select - public static final String MANE_SELECT_NAME = "MANE Select"; - public static final String MANE_SELECT_VERSION_FILENAME = "maneSelect" + SUFFIX_VERSION_FILENAME; + // Gene annotation + // - MANE Select + public static final String MANE_SELECT_DATA = "MANE Select"; // Must match the configuration file public static final String MANE_SELECT_FILE_ID = "MANE_SELECT"; - - // LRG - public static final String LRG_NAME = "LRG"; - public static final String LRG_VERSION_FILENAME = "lrg" + SUFFIX_VERSION_FILENAME; + // - LRG + public static final String LRG_DATA = "lrg"; // Must match the configuration file public static final String LRG_FILE_ID = "LRG"; - - // HGNC - public static final String HGNC_NAME = "HGNC Gene"; - public static final String HGNC_VERSION_FILENAME = "hgnc" + SUFFIX_VERSION_FILENAME; + // - HGNC + public static final String HGNC_DATA = "hgnc"; // Must match the configuration file public static final String HGNC_FILE_ID = "HGNC"; - - // Cancer HotSpot - public static final String CANCER_HOTSPOT_NAME = "Cancer HotSpot"; - public static final String CANCER_HOTSPOT_VERSION_FILENAME = "cancerHotSpot" + SUFFIX_VERSION_FILENAME; + // - Cancer HotSpot + public static final String CANCER_HOTSPOT_DATA = "cancer_hotspot"; // Must match the configuration file public static final String CANCER_HOTSPOT_FILE_ID = "CANCER_HOTSPOT"; - - // DGID (drug) - public static final String DGIDB_NAME = "DGIdb"; - public static final String DGIDB_VERSION_FILENAME = 
"dgidb" + SUFFIX_VERSION_FILENAME; + // - DGID (drug) + public static final String DGIDB_DATA = "dgidb"; // Must match the configuration file public static final String DGIDB_FILE_ID = "DGIDB"; - - // UniProt Xref - public static final String UNIPROT_XREF_NAME = "UniProt Xref"; - public static final String UNIPROT_XREF_VERSION_FILENAME = "uniprotXref" + SUFFIX_VERSION_FILENAME; + // - UniProt Xref + public static final String UNIPROT_XREF_DATA = "uniprot_xref"; // Must match the configuration file public static final String UNIPROT_XREF_FILE_ID = "UNIPROT_XREF"; - - // Gene Expression Atlas - public static final String GENE_EXPRESSION_ATLAS_NAME = "Gene Expression Atlas"; - public static final String GENE_EXPRESSION_ATLAS_VERSION_FILENAME = "geneExpressionAtlas" + SUFFIX_VERSION_FILENAME; + // - Gene Expression Atlas + public static final String GENE_EXPRESSION_ATLAS_DATA = "gene_expression_atlas"; // Must match the configuration file public static final String GENE_EXPRESSION_ATLAS_FILE_ID = "GENE_EXPRESSION_ATLAS"; - - // Gene Disease Annotation + // - Gene Disease Annotation public static final String GENE_DISEASE_ANNOTATION_NAME = "Gene Disease Annotation"; - // HPO - public static final String HPO_NAME = "HPO"; - public static final String HPO_VERSION_FILENAME = "hpo" + SUFFIX_VERSION_FILENAME; - // DISGENET - public static final String DISGENET_NAME = "DisGeNet"; - public static final String DISGENET_VERSION_FILENAME = "disGeNet" + SUFFIX_VERSION_FILENAME; + // - HPO + public static final String HPO_DATA = "hpo"; + // - DISGENET + public static final String DISGENET_DATA = "disgenet"; // Must match the configuration file public static final String DISGENET_FILE_ID = "DISGENET"; - - // gnomAD Constraints - public static final String GNOMAD_CONSTRAINTS_NAME = "gnomAD Constraints"; - public static final String GNOMAD_CONSTRAINTS_VERSION_FILENAME = "gnomadConstraints" + SUFFIX_VERSION_FILENAME; + // - gnomAD Constraints + public static final String 
GNOMAD_CONSTRAINTS_DATA = "gnomad_constraints"; // Must match the configuration file public static final String GNOMAD_CONSTRAINTS_FILE_ID = "GNOMAD_CONSTRAINTS"; - - // GO Annotation - public static final String GO_ANNOTATION_NAME = "EBI Gene Ontology Annotation"; - public static final String GO_ANNOTATION_VERSION_FILENAME = "goAnnotation" + SUFFIX_VERSION_FILENAME; + // - GO Annotation + public static final String GO_ANNOTATION_DATA = "go_annotation"; // Must match the configuration file public static final String GO_ANNOTATION_FILE_ID = "GO_ANNOTATION"; @@ -157,14 +136,9 @@ public final class EtlCommons { public static final String SPLICE_SCORE_DATA = "splice_score"; // Pharmacogenomics - public static final String PHARMACOGENOMICS_NAME = "Pharmacogenomics"; public static final String PHARMACOGENOMICS_DATA = "pharmacogenomics"; - public static final String PHARMACOGENOMICS_SUBDIRECTORY = "pharmacogenomics"; // PharmGKB - public static final String PHARMGKB_NAME = "PharmGKB"; public static final String PHARMGKB_DATA = "pharmgkb"; - public static final String PHARMGKB_SUBDIRECTORY = "pharmgkb"; - public static final String PHARMGKB_VERSION_FILENAME = "pharmGKB" + SUFFIX_VERSION_FILENAME; // Must match the configuration file public static final String PHARMGKB_GENES_FILE_ID = "GENES"; public static final String PHARMGKB_CHEMICALS_FILE_ID = "CHEMICALS"; @@ -211,50 +185,42 @@ public final class EtlCommons { public static final String GWAS_FILE_ID = "GWAS"; // Repeats - public static final String REPEATS_NAME = "Repeats"; public static final String REPEATS_DATA = "repeats"; - public static final String REPEATS_SUBDIRECTORY = GENOME_SUBDIRECTORY; /** * @deprecated (when refactoring downloaders, builders and loaders) */ @Deprecated public static final String REPEATS_JSON = "repeats"; // Simple repeats - public static final String TRF_NAME = "Tandem Repeats Finder"; - public static final String TRF_VERSION_FILENAME = "simpleRepeat" + SUFFIX_VERSION_FILENAME; + public 
static final String TRF_DATA = "trf"; + // Must match the configuration file public static final String SIMPLE_REPEATS_FILE_ID = "SIMPLE_REPEATS"; // Genomic super duplications - public static final String GSD_NAME = "Genomic Super Duplications"; - public static final String GSD_VERSION_FILENAME = "genomicSuperDups" + SUFFIX_VERSION_FILENAME; + public static final String GSD_DATA = "gsd"; + // Must match the configuration file public static final String GENOMIC_SUPER_DUPS_FILE_ID = "GENOMIC_SUPER_DUPS"; // Window masker - public static final String WM_NAME = "Window Masker"; - public static final String WM_VERSION_FILENAME = "windowMasker" + SUFFIX_VERSION_FILENAME; + public static final String WM_DATA = "wm"; + // Must match the configuration file public static final String WINDOW_MASKER_FILE_ID = "WINDOW_MASKER"; // Ontology - public static final String ONTOLOGY_NAME = "Ontology"; public static final String ONTOLOGY_DATA = "ontology"; - public static final String ONTOLOGY_SUBDIRECTORY = ONTOLOGY_DATA; public static final String OBO_BASENAME = "ontology"; // HPO - public static final String HPO_OBO_NAME = "HPO"; - public static final String HPO_OBO_VERSION_FILENAME = "hpoObo" + SUFFIX_VERSION_FILENAME; + public static final String HPO_OBO_DATA = "hpo"; // Must match the configuration file public static final String HPO_OBO_FILE_ID = "HPO"; // GO - public static final String GO_OBO_NAME = "GO"; - public static final String GO_OBO_VERSION_FILENAME = "goObo" + SUFFIX_VERSION_FILENAME; + public static final String GO_OBO_DATA = "go"; // Must match the configuration file public static final String GO_OBO_FILE_ID = "GO"; // DOID - public static final String DOID_OBO_NAME = "DOID"; - public static final String DOID_OBO_VERSION_FILENAME = "doidObo" + SUFFIX_VERSION_FILENAME; + public static final String DOID_OBO_DATA = "doid"; // Must match the configuration file public static final String DOID_OBO_FILE_ID = "DOID"; // MONDO - public static final String MONDO_OBO_NAME = 
"Mondo"; - public static final String MONDO_OBO_VERSION_FILENAME = "mondoObo" + SUFFIX_VERSION_FILENAME; + public static final String MONDO_OBO_DATA = "mondo"; // Must match the configuration file public static final String MONDO_OBO_FILE_ID = "MONDO"; @@ -271,79 +237,54 @@ public final class EtlCommons { public static final String CADD_FILE_ID = "CADD"; // Regulation - public static final String REGULATION_NAME = "Regulation"; public static final String REGULATION_DATA = "regulation"; - public static final String REGULATION_SUBDIRECTORY = REGULATION_DATA; public static final String REGULATORY_PFM_BASENAME = "regulatory_pfm"; public static final String REGULATORY_REGION_BASENAME = "regulatory_region"; // Regulatory build and motif features (see Ensembl files: regulatory build and motif features files) - public static final String REGULATORY_BUILD_NAME = "Regulatory Build"; - public static final String REGULATORY_BUILD_VERSION_FILENAME = "regulatoryBuild" + SUFFIX_VERSION_FILENAME; + public static final String REGULATORY_BUILD_DATA = "regulatory_build"; // Motif features (see Ensembl files) - public static final String MOTIF_FEATURES_NAME = "Motif Features"; - public static final String MOTIF_FEATURES_VERSION_FILENAME = "motifFeatures" + SUFFIX_VERSION_FILENAME; + public static final String MOTIF_FEATURES_DATA = "motif_features"; // miRBase - public static final String MIRBASE_NAME = "miRBase"; - public static final String MIRBASE_VERSION_FILENAME = "mirbase" + SUFFIX_VERSION_FILENAME; + public static final String MIRBASE_DATA = "mirbase"; // Must match the configuration file public static final String MIRBASE_FILE_ID = "MIRBASE"; // miRTarBase - public static final String MIRTARBASE_NAME = "miRTarBase"; - public static final String MIRTARBASE_VERSION_FILENAME = "mirTarBase" + SUFFIX_VERSION_FILENAME; + public static final String MIRTARBASE_DATA = "mirtarbase"; // Must match the configuration file public static final String MIRTARBASE_FILE_ID = "MIRTARBASE"; - // 
Build specific data options - public static final String GENOME_INFO_DATA = "genome_info"; - public static final String DISGENET_DATA = "disgenet"; - public static final String HPO_DATA = "hpo"; - public static final String PPI_DATA = "ppi"; - public static final String DRUG_DATA = "drug"; - // Load specific data options public static final String PROTEIN_FUNCTIONAL_PREDICTION_DATA = "protein_functional_prediction"; // Protein - public static final String PROTEIN_NAME = "Protein"; public static final String PROTEIN_DATA = "protein"; - public static final String PROTEIN_SUBDIRECTORY = "protein"; // UniProt - public static final String UNIPROT_NAME = "UniProt"; + public static final String UNIPROT_DATA = "uniprot"; public static final String UNIPROT_CHUNKS_SUBDIRECTORY = "uniprot_chunks"; - public static final String UNIPROT_VERSION_FILENAME = "uniprot" + SUFFIX_VERSION_FILENAME; // Must match the configuration file public static final String UNIPROT_FILE_ID = "UNIPROT"; // InterPro - public static final String INTERPRO_NAME = "InterPro"; - public static final String INTERPRO_VERSION_FILENAME = "interpro" + SUFFIX_VERSION_FILENAME; + public static final String INTERPRO_DATA = "interpro"; // Must match the configuration file public static final String INTERPRO_FILE_ID = "INTERPRO"; // IntAct - public static final String INTACT_NAME = "IntAct"; - public static final String INTACT_VERSION_FILENAME = "intact" + SUFFIX_VERSION_FILENAME; + public static final String INTACT_DATA = "intact"; // Must match the configuration file public static final String INTACT_FILE_ID = "INTACT"; // Conservation scores - public static final String CONSERVATION_NAME = "Conservation"; public static final String CONSERVATION_DATA = "conservation"; - public static final String CONSERVATION_SUBDIRECTORY = "conservation"; // GERP - public static final String GERP_NAME = "GERP++"; - public static final String GERP_SUBDIRECTORY = "gerp"; - public static final String GERP_VERSION_FILENAME = "gerp" + 
SUFFIX_VERSION_FILENAME; + public static final String GERP_DATA = "gerp"; + // Must match the configuration file public static final String GERP_FILE_ID = "GERP"; // PHASTCONS - public static final String PHASTCONS_NAME = "PhastCons"; public static final String PHASTCONS_DATA = "phastCons"; - public static final String PHASTCONS_SUBDIRECTORY = PHASTCONS_DATA; - public static final String PHASTCONS_VERSION_FILENAME = PHASTCONS_DATA + SUFFIX_VERSION_FILENAME; + // Must match the configuration file public static final String PHASTCONS_FILE_ID = "PHASTCONS"; // PHYLOP - public static final String PHYLOP_NAME = "PhyloP"; public static final String PHYLOP_DATA = "phylop"; - public static final String PHYLOP_SUBDIRECTORY = PHYLOP_DATA; - public static final String PHYLOP_VERSION_FILENAME = PHYLOP_DATA + SUFFIX_VERSION_FILENAME; + // Must match the configuration file public static final String PHYLOP_FILE_ID = "PHYLOP"; // Splice scores @@ -380,19 +321,125 @@ public final class EtlCommons { static { // Populate data names map + dataNamesMap.put(ENSEMBL_DATA, "Ensembl"); + dataNamesMap.put(REFSEQ_DATA, "RefSeq"); + dataNamesMap.put(GENOME_DATA, "Genome"); + dataNamesMap.put(GENE_DATA, "Gene"); + dataNamesMap.put(GENE_ANNOTATION_DATA, "Gene Annotation"); + dataCategoriesMap.put(REFSEQ_DATA, "Gene"); + dataNamesMap.put(MANE_SELECT_DATA, "MANE Select"); + dataNamesMap.put(LRG_DATA, "LRG"); + dataNamesMap.put(HGNC_DATA, "HGNC Gene"); + dataNamesMap.put(CANCER_HOTSPOT_DATA, "Cancer HotSpot"); + dataNamesMap.put(DGIDB_DATA, "DGIdb"); + dataNamesMap.put(UNIPROT_XREF_DATA, "UniProt Xref"); + dataNamesMap.put(GENE_EXPRESSION_ATLAS_DATA, "Gene Expression Atlas"); + dataNamesMap.put(GENE_DISEASE_ANNOTATION_DATA, "Gene Disease Annotation"); + dataNamesMap.put(HPO_DATA, "HPO"); + dataNamesMap.put(DISGENET_DATA, "DisGeNet"); + dataNamesMap.put(GNOMAD_CONSTRAINTS_DATA, "gnomAD Constraints"); + dataNamesMap.put(GO_ANNOTATION_DATA, "EBI Gene Ontology Annotation"); + 
dataNamesMap.put(PROTEIN_DATA, "Protein"); + dataNamesMap.put(UNIPROT_DATA, "UniProt"); + dataNamesMap.put(INTERPRO_DATA, "InterPro"); + dataNamesMap.put(INTACT_DATA, "IntAct"); + dataNamesMap.put(CONSERVATION_DATA, "Conservation"); + dataNamesMap.put(GERP_DATA, "GERP++"); + dataNamesMap.put(PHASTCONS_DATA, "PhastCons"); + dataNamesMap.put(PHYLOP_DATA, "PhyloP"); + dataNamesMap.put(REPEATS_DATA, "Repeats"); + dataNamesMap.put(TRF_DATA, "Tandem Repeats Finder"); + dataNamesMap.put(WM_DATA, "Window Masker"); + dataNamesMap.put(GSD_DATA, "Genomic Super Duplications"); + dataNamesMap.put(REGULATION_DATA, "Regulation"); + dataNamesMap.put(REGULATORY_BUILD_DATA, "Regulatory Build"); + dataNamesMap.put(MOTIF_FEATURES_DATA, "Motif Features"); + dataNamesMap.put(MIRBASE_DATA, "miRBase"); + dataNamesMap.put(MIRTARBASE_DATA, "miRTarBase"); + dataNamesMap.put(ONTOLOGY_DATA, "Ontology"); + dataNamesMap.put(HPO_OBO_DATA, "HPO"); + dataNamesMap.put(GO_OBO_DATA, "GO"); + dataNamesMap.put(DOID_OBO_DATA, "DOID"); + dataNamesMap.put(MONDO_OBO_DATA, "Mondo"); dataNamesMap.put(PUBMED_DATA, "PubMed"); + dataNamesMap.put(PHARMACOGENOMICS_DATA, "Pharmacogenomics"); + dataNamesMap.put(PHARMGKB_DATA, "PharmGKB"); dataNamesMap.put(VARIATION_FUNCTIONAL_SCORE_DATA, "Variant Functional Scores"); dataNamesMap.put(CADD_DATA, "CADD"); dataNamesMap.put(MISSENSE_VARIATION_SCORE_DATA, "Missense Variation Scores"); dataNamesMap.put(REVEL_DATA, "Revel"); // Populate data categories map + dataCategoriesMap.put(ENSEMBL_DATA, "Gene"); + dataCategoriesMap.put(REFSEQ_DATA, "Gene"); + dataCategoriesMap.put(GENOME_DATA, dataNamesMap.get(ENSEMBL_DATA)); + dataCategoriesMap.put(MANE_SELECT_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(LRG_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(HGNC_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(CANCER_HOTSPOT_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(DGIDB_DATA, 
dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(UNIPROT_XREF_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(GENE_EXPRESSION_ATLAS_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(HPO_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(DISGENET_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(GNOMAD_CONSTRAINTS_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(GO_ANNOTATION_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(UNIPROT_DATA, dataNamesMap.get(PROTEIN_DATA)); + dataCategoriesMap.put(INTERPRO_DATA, dataNamesMap.get(PROTEIN_DATA)); + dataCategoriesMap.put(INTACT_DATA, dataNamesMap.get(PROTEIN_DATA)); + dataCategoriesMap.put(GERP_DATA, dataNamesMap.get(CONSERVATION_DATA)); + dataCategoriesMap.put(PHASTCONS_DATA, dataNamesMap.get(CONSERVATION_DATA)); + dataCategoriesMap.put(PHYLOP_DATA, dataNamesMap.get(CONSERVATION_DATA)); + dataCategoriesMap.put(TRF_DATA, dataNamesMap.get(REPEATS_DATA)); + dataCategoriesMap.put(WM_DATA, dataNamesMap.get(REPEATS_DATA)); + dataCategoriesMap.put(GSD_DATA, dataNamesMap.get(REPEATS_DATA)); + dataCategoriesMap.put(REGULATORY_BUILD_DATA, dataNamesMap.get(REGULATION_DATA)); + dataCategoriesMap.put(MOTIF_FEATURES_DATA, dataNamesMap.get(REGULATION_DATA)); + dataCategoriesMap.put(MIRBASE_DATA, dataNamesMap.get(REGULATION_DATA)); + dataCategoriesMap.put(MIRTARBASE_DATA, dataNamesMap.get(REGULATION_DATA)); + dataCategoriesMap.put(HPO_OBO_DATA, dataNamesMap.get(ONTOLOGY_DATA)); + dataCategoriesMap.put(GO_OBO_DATA, dataNamesMap.get(ONTOLOGY_DATA)); + dataCategoriesMap.put(DOID_OBO_DATA, dataNamesMap.get(ONTOLOGY_DATA)); + dataCategoriesMap.put(MONDO_OBO_DATA, dataNamesMap.get(ONTOLOGY_DATA)); dataCategoriesMap.put(PUBMED_DATA, "Publication"); + dataCategoriesMap.put(PHARMGKB_DATA, dataNamesMap.get(PHARMACOGENOMICS_DATA)); dataCategoriesMap.put(CADD_DATA, 
dataNamesMap.get(VARIATION_FUNCTIONAL_SCORE_DATA)); dataCategoriesMap.put(REVEL_DATA, dataNamesMap.get(MISSENSE_VARIATION_SCORE_DATA)); // Populate data version filenames Map + dataVersionFilenamesMap.put(ENSEMBL_DATA, "ensemblCore" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(REFSEQ_DATA, "refSeqCore" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(GENOME_DATA, "genome" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(MANE_SELECT_DATA, "maneSelect" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(LRG_DATA, "lrg" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(HGNC_DATA, "hgnc" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(CANCER_HOTSPOT_DATA, "cancerHotSpot" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(DGIDB_DATA, "dgidb" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(UNIPROT_XREF_DATA, "uniProtXref" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(GENE_EXPRESSION_ATLAS_DATA, "geneExpressionAtlas" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(HPO_DATA, "hpo" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(DISGENET_DATA, "disGeNet" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(GNOMAD_CONSTRAINTS_DATA, "gnomadConstraints" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(GO_ANNOTATION_DATA, "goAnnotation" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(UNIPROT_DATA, "uniProt" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(INTERPRO_DATA, "interPro" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(INTACT_DATA, "intAct" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(GERP_DATA, "gerp" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(PHASTCONS_DATA, "phastCons" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(PHYLOP_DATA, "phyloP" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(TRF_DATA, "simpleRepeat" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(WM_DATA, 
"windowMasker" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(GSD_DATA, "genomicSuperDups" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(REGULATORY_BUILD_DATA, "regulatoryBuild" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(MOTIF_FEATURES_DATA, "motifFeatures" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(MIRBASE_DATA, "mirBase" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(MIRTARBASE_DATA, "mirTarBase" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(HPO_OBO_DATA, "hpoObo" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(GO_OBO_DATA, "goObo" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(DOID_OBO_DATA, "doidObo" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(MONDO_OBO_DATA, "mondoObo" + SUFFIX_VERSION_FILENAME); dataVersionFilenamesMap.put(PUBMED_DATA, "pubMed" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(PHARMGKB_DATA, "pharmGKB" + SUFFIX_VERSION_FILENAME); dataVersionFilenamesMap.put(CADD_DATA, "cadd" + SUFFIX_VERSION_FILENAME); dataVersionFilenamesMap.put(REVEL_DATA, "revel" + SUFFIX_VERSION_FILENAME); } @@ -593,4 +640,8 @@ public static String getDataVersionFilename(String data) throws CellBaseExceptio } return dataVersionFilenamesMap.get(data); } + + public static List getUrls(List downloadFiles) { + return downloadFiles.stream().map(DownloadFile::getUrl).collect(Collectors.toList()); + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index a05760f686..7c4e331f18 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -183,6 +183,25 @@ protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLPropertie return downloadFile; } + protected DownloadFile 
downloadAndSaveEnsemblDataSource(DownloadProperties.EnsemblProperties ensemblProps, String fileId, String data, + Path outPath) throws IOException, InterruptedException, CellBaseException { + return downloadAndSaveEnsemblDataSource(ensemblProps, fileId, data, null, outPath); + } + + protected DownloadFile downloadAndSaveEnsemblDataSource(DownloadProperties.EnsemblProperties ensemblProps, String fileId, String data, + String chromosome, Path outPath) + throws IOException, InterruptedException, CellBaseException { + // Download file + DownloadFile downloadFile = downloadEnsemblDataSource(ensemblProps, fileId, chromosome, outPath); + + // Save data source + saveDataSource(data, "(Ensembl " + ensemblVersion + ")", getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + outPath.resolve(getDataVersionFilename(data))); + + return downloadFile; + } + + @Deprecated protected DownloadFile downloadAndSaveEnsemblDataSource(DownloadProperties.EnsemblProperties ensemblProps, String fileId, String name, String category, String chromosome, String versionFilename, Path outPath) throws IOException, InterruptedException, CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index 7ea434c24c..ee332dd8ea 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -17,14 +17,13 @@ package org.opencb.cellbase.lib.download; import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.lib.EtlCommons; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.*; -import java.util.stream.Collectors; import static 
org.opencb.cellbase.lib.EtlCommons.*; @@ -49,182 +48,223 @@ public GeneDownloadManager(String species, String assembly, Path targetDirectory @Override public List download() throws IOException, InterruptedException, CellBaseException { - logger.info("Downloading gene information ..."); - Path geneFolder = downloadFolder.resolve("gene"); - Files.createDirectories(geneFolder); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_DATA)); - Path refseqFolder = downloadFolder.resolve("refseq"); - Files.createDirectories(refseqFolder); + // Create gene folder + Path geneDownloadPath = downloadFolder.resolve(GENE_DATA); + + // Create Ensembl folder + Path ensemblDownloadPath = geneDownloadPath.resolve(ENSEMBL_DATA); + Files.createDirectories(ensemblDownloadPath); + + // Create RefSeq folder + Path refSeqDownloadPath = geneDownloadPath.resolve(REFSEQ_DATA); + Files.createDirectories(refSeqDownloadPath); List downloadFiles = new ArrayList<>(); - downloadFiles.addAll(downloadEnsemblData(geneFolder)); - downloadFiles.addAll(downloadRefSeq(refseqFolder)); - downloadFiles.add(downloadMane(geneFolder)); - downloadFiles.add(downloadLrg(geneFolder)); - downloadFiles.add(downloadHgnc(geneFolder)); - downloadFiles.add(downloadCancerHotspot(geneFolder)); - downloadFiles.add(downloadDrugData(geneFolder)); - downloadFiles.add(downloadGeneUniprotXref(geneFolder)); - downloadFiles.add(downloadGeneExpressionAtlas(geneFolder)); - downloadFiles.add(downloadGeneDiseaseAnnotation(geneFolder)); - downloadFiles.add(downloadGnomadConstraints(geneFolder)); - downloadFiles.add(downloadGO(geneFolder)); + // Ensembl + downloadFiles.addAll(downloadEnsemblData(ensemblDownloadPath)); + + // RefSeq + downloadFiles.addAll(downloadRefSeq(refSeqDownloadPath)); + + // Gene annotation + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_ANNOTATION_DATA)); + downloadFiles.add(downloadMane(geneDownloadPath)); + downloadFiles.add(downloadLrg(geneDownloadPath)); + 
downloadFiles.add(downloadHgnc(geneDownloadPath)); + downloadFiles.add(downloadCancerHotspot(geneDownloadPath)); + downloadFiles.add(downloadDrugData(geneDownloadPath)); + downloadFiles.add(downloadGeneUniprotXref(geneDownloadPath)); + downloadFiles.add(downloadGeneExpressionAtlas(geneDownloadPath)); + downloadFiles.add(downloadGeneDiseaseAnnotation(geneDownloadPath)); + downloadFiles.add(downloadGnomadConstraints(geneDownloadPath)); + downloadFiles.add(downloadGO(geneDownloadPath)); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_ANNOTATION_DATA)); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_DATA)); return downloadFiles; } - private List downloadEnsemblData(Path geneFolder) throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, ENSEMBL_NAME); + private List downloadEnsemblData(Path ensemblDownloadPath) throws IOException, InterruptedException, CellBaseException { + logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataName(ENSEMBL_DATA), getDataCategory(ENSEMBL_DATA)); List downloadFiles = new ArrayList<>(); + DownloadProperties.EnsemblProperties ensemblProps = configuration.getDownload().getEnsembl(); // GTF - downloadFiles.add(downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_GTF_FILE_ID, geneFolder)); + downloadFiles.add(downloadEnsemblDataSource(ensemblProps, ENSEMBL_GTF_FILE_ID, ensemblDownloadPath)); // PEP - downloadFiles.add(downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_PEP_FA_FILE_ID, geneFolder)); + downloadFiles.add(downloadEnsemblDataSource(ensemblProps, ENSEMBL_PEP_FA_FILE_ID, ensemblDownloadPath)); // CDNA - downloadFiles.add(downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_CDNA_FA_FILE_ID, geneFolder)); + downloadFiles.add(downloadEnsemblDataSource(ensemblProps, ENSEMBL_CDNA_FA_FILE_ID, ensemblDownloadPath)); // Save data source (i.e., metadata) - saveDataSource(EtlCommons.GENE_DATA, 
ENSEMBL_NAME, ensemblVersion, getTimeStamp(), - downloadFiles.stream().map(DownloadFile::getUrl).collect(Collectors.toList()), - geneFolder.resolve(ENSEMBL_CORE_VERSION_FILENAME)); + saveDataSource(ENSEMBL_DATA, ensemblVersion, getTimeStamp(), getUrls(downloadFiles), + ensemblDownloadPath.resolve(getDataVersionFilename(ENSEMBL_DATA))); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, ENSEMBL_NAME); + logger.info(CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE, getDataName(ENSEMBL_DATA), getDataCategory(ENSEMBL_DATA)); return downloadFiles; } - private List downloadRefSeq(Path refSeqFolder) throws IOException, InterruptedException, CellBaseException { + private List downloadRefSeq(Path refSeqDownloadPath) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info(DOWNLOADING_LOG_MESSAGE, REFSEQ_NAME); + logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataName(REFSEQ_DATA), getDataCategory(REFSEQ_DATA)); List downloadFiles = new ArrayList<>(); + DownloadProperties.URLProperties refSeqProps = configuration.getDownload().getRefSeq(); // GTF - downloadFiles.add(downloadDataSource(configuration.getDownload().getRefSeq(), REFSEQ_GENOMIC_GTF_FILE_ID, refSeqFolder)); + downloadFiles.add(downloadDataSource(refSeqProps, REFSEQ_GENOMIC_GTF_FILE_ID, refSeqDownloadPath)); // Genomic FASTA - downloadFiles.add(downloadDataSource(configuration.getDownload().getRefSeq(), REFSEQ_GENOMIC_FNA_FILE_ID, refSeqFolder)); + downloadFiles.add(downloadDataSource(refSeqProps, REFSEQ_GENOMIC_FNA_FILE_ID, refSeqDownloadPath)); // Protein FASTA - downloadFiles.add(downloadDataSource(configuration.getDownload().getRefSeq(), REFSEQ_PROTEIN_FAA_FILE_ID, refSeqFolder)); + downloadFiles.add(downloadDataSource(refSeqProps, REFSEQ_PROTEIN_FAA_FILE_ID, refSeqDownloadPath)); // cDNA - downloadFiles.add(downloadDataSource(configuration.getDownload().getRefSeq(), REFSEQ_RNA_FNA_FILE_ID, refSeqFolder)); + 
downloadFiles.add(downloadDataSource(refSeqProps, REFSEQ_RNA_FNA_FILE_ID, refSeqDownloadPath)); // Save data source (i.e., metadata) - saveDataSource(REFSEQ_NAME, GENE_DATA, configuration.getDownload().getRefSeq().getVersion(), getTimeStamp(), - downloadFiles.stream().map(DownloadFile::getUrl).collect(Collectors.toList()), - refSeqFolder.resolve(REFSEQ_VERSION_FILENAME)); + saveDataSource(REFSEQ_DATA, refSeqProps.getVersion(), getTimeStamp(), getUrls(downloadFiles), + refSeqDownloadPath.resolve(getDataVersionFilename(REFSEQ_DATA))); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, REFSEQ_NAME); + logger.info(CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE, getDataName(REFSEQ_DATA), getDataCategory(REFSEQ_DATA)); return downloadFiles; } return Collections.emptyList(); } - private DownloadFile downloadMane(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + private DownloadFile downloadMane(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info(DOWNLOADING_LOG_MESSAGE, MANE_SELECT_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(MANE_SELECT_DATA)); + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getManeSelect(), MANE_SELECT_FILE_ID, - MANE_SELECT_NAME, GENE_DATA, MANE_SELECT_VERSION_FILENAME, geneFolder); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, MANE_SELECT_NAME); + MANE_SELECT_DATA, geneDownloadPath); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(MANE_SELECT_DATA)); return downloadFile; } return null; } - private DownloadFile downloadLrg(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + private DownloadFile downloadLrg(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info(DOWNLOADING_LOG_MESSAGE, LRG_NAME); - DownloadFile 
downloadFile = downloadAndSaveDataSource(configuration.getDownload().getLrg(), LRG_FILE_ID, LRG_NAME, GENE_DATA, - LRG_VERSION_FILENAME, geneFolder); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, LRG_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(LRG_DATA)); + + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getLrg(), LRG_FILE_ID, LRG_DATA, + geneDownloadPath); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(LRG_DATA)); return downloadFile; } return null; } - private DownloadFile downloadHgnc(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + private DownloadFile downloadHgnc(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info(DOWNLOADING_LOG_MESSAGE, HGNC_NAME); - DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getHgnc(), HGNC_FILE_ID, HGNC_NAME, GENE_DATA, - HGNC_VERSION_FILENAME, geneFolder); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, HGNC_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(HGNC_DATA)); + + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getHgnc(), HGNC_FILE_ID, HGNC_DATA, + geneDownloadPath); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(HGNC_DATA)); return downloadFile; } return null; } - private DownloadFile downloadCancerHotspot(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + private DownloadFile downloadCancerHotspot(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info(DOWNLOADING_LOG_MESSAGE, CANCER_HOTSPOT_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CANCER_HOTSPOT_DATA)); + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getCancerHotspot(), 
CANCER_HOTSPOT_FILE_ID, - CANCER_HOTSPOT_NAME, GENE_DATA, CANCER_HOTSPOT_VERSION_FILENAME, geneFolder); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, CANCER_HOTSPOT_NAME); + CANCER_HOTSPOT_DATA, geneDownloadPath); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(CANCER_HOTSPOT_DATA)); return downloadFile; } return null; } - private DownloadFile downloadDrugData(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + private DownloadFile downloadDrugData(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info(DOWNLOADING_LOG_MESSAGE, DGIDB_NAME); - DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDgidb(), DGIDB_FILE_ID, DGIDB_NAME, - GENE_DATA, DGIDB_VERSION_FILENAME, geneFolder); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, DGIDB_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(DGIDB_DATA)); + + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDgidb(), DGIDB_FILE_ID, DGIDB_DATA, + geneDownloadPath); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(DGIDB_DATA)); return downloadFile; } return null; } - private DownloadFile downloadGeneUniprotXref(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + private DownloadFile downloadGeneUniprotXref(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { if (GENE_UNIPROT_XREF_FILES.containsKey(speciesConfiguration.getScientificName())) { - logger.info(DOWNLOADING_LOG_MESSAGE, UNIPROT_XREF_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(UNIPROT_XREF_DATA)); + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGeneUniprotXref(), UNIPROT_XREF_FILE_ID, - UNIPROT_XREF_NAME, GENE_DATA, UNIPROT_XREF_VERSION_FILENAME, geneFolder); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, UNIPROT_XREF_NAME); + 
UNIPROT_XREF_DATA, geneDownloadPath); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(UNIPROT_XREF_DATA)); return downloadFile; } return null; } - private DownloadFile downloadGeneExpressionAtlas(Path geneFolder) throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, GENE_EXPRESSION_ATLAS_NAME); + private DownloadFile downloadGeneExpressionAtlas(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_EXPRESSION_ATLAS_DATA)); + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGeneExpressionAtlas(), - GENE_EXPRESSION_ATLAS_FILE_ID, GENE_EXPRESSION_ATLAS_NAME, GENE_DATA, GENE_EXPRESSION_ATLAS_VERSION_FILENAME, geneFolder); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, GENE_EXPRESSION_ATLAS_NAME); + GENE_EXPRESSION_ATLAS_FILE_ID, GENE_EXPRESSION_ATLAS_DATA, geneDownloadPath); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_EXPRESSION_ATLAS_DATA)); return downloadFile; } - private DownloadFile downloadGeneDiseaseAnnotation(Path geneFolder) throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, GENE_DISEASE_ANNOTATION_NAME); + private DownloadFile downloadGeneDiseaseAnnotation(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_DISEASE_ANNOTATION_DATA)); + // HPO // IMPORTANT !!! 
logger.warn("{} must be downloaded manually from {} and then create the file {} with data ({}), name ({}) and the version", - HPO_NAME, configuration.getDownload().getHpo().getHost(), HPO_VERSION_FILENAME, GENE_DATA, HPO_NAME); - saveDataSource(HPO_NAME, GENE_DISEASE_ANNOTATION_NAME, configuration.getDownload().getHpo().getVersion(), getTimeStamp(), - Collections.singletonList(configuration.getDownload().getHpo().getHost()), geneFolder.resolve(HPO_VERSION_FILENAME)); + getDataName(HPO_DATA), configuration.getDownload().getHpo().getHost(), getDataVersionFilename(HPO_DATA), + getDataCategory(HPO_DATA), getDataName(HPO_DATA)); + saveDataSource(HPO_DATA, configuration.getDownload().getHpo().getVersion(), getTimeStamp(), + Collections.singletonList(configuration.getDownload().getHpo().getHost()), + geneDownloadPath.resolve(getDataVersionFilename(HPO_DATA))); - DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDisgenet(), DISGENET_FILE_ID, DISGENET_NAME, - GENE_DISEASE_ANNOTATION_NAME, DISGENET_VERSION_FILENAME, geneFolder); + // DisGeNet + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDisgenet(), DISGENET_FILE_ID, DISGENET_DATA, + geneDownloadPath); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, GENE_DISEASE_ANNOTATION_NAME); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_DISEASE_ANNOTATION_DATA)); return downloadFile; } - private DownloadFile downloadGnomadConstraints(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + private DownloadFile downloadGnomadConstraints(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info(DOWNLOADING_LOG_MESSAGE, GNOMAD_CONSTRAINTS_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getGnomadConstraints(), GNOMAD_CONSTRAINTS_FILE_ID, - GNOMAD_CONSTRAINTS_NAME, GENE_DATA, 
GNOMAD_CONSTRAINTS_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GNOMAD_CONSTRAINTS_DATA)); + + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGnomadConstraints(), + GNOMAD_CONSTRAINTS_FILE_ID, GNOMAD_CONSTRAINTS_DATA, geneDownloadPath); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GNOMAD_CONSTRAINTS_DATA)); + return downloadFile; } return null; } - private DownloadFile downloadGO(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + private DownloadFile downloadGO(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info(DOWNLOADING_LOG_MESSAGE, GO_ANNOTATION_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getGoAnnotation(), GO_ANNOTATION_FILE_ID, GO_ANNOTATION_NAME, - GENE_DATA, GO_ANNOTATION_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GO_ANNOTATION_DATA)); + + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGoAnnotation(), GO_ANNOTATION_FILE_ID, + GO_ANNOTATION_DATA, geneDownloadPath); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GO_ANNOTATION_DATA)); + return downloadFile; } return null; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index 210d5bc39f..289ec23258 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -19,7 +19,6 @@ import com.beust.jcommander.ParameterException; import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.lib.EtlCommons; import
java.io.IOException; import java.nio.file.Files; @@ -44,8 +43,8 @@ public List download() throws IOException, InterruptedException, C } public List downloadReferenceGenome() throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, GENOME_NAME); - Path sequenceFolder = downloadFolder.resolve(GENOME_SUBDIRECTORY); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENOME_DATA)); + Path sequenceFolder = downloadFolder.resolve(GENOME_DATA); Files.createDirectories(sequenceFolder); // Reference genome sequences are downloaded from Ensembl @@ -54,10 +53,10 @@ public List downloadReferenceGenome() throws IOException, Interrup sequenceFolder); // Save data source - saveDataSource(ENSEMBL_NAME, EtlCommons.GENOME_DATA, ensemblVersion, getTimeStamp(), - Collections.singletonList(downloadFile.getUrl()), sequenceFolder.resolve(GENOME_VERSION_FILENAME)); + saveDataSource(GENOME_DATA, ensemblVersion, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + sequenceFolder.resolve(getDataVersionFilename(GENOME_DATA))); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, GENOME_NAME); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENOME_DATA)); return Collections.singletonList(downloadFile); } @@ -75,13 +74,13 @@ public List downloadConservation() throws IOException, Interrupted } List downloadFiles = new ArrayList<>(); if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info(DOWNLOADING_LOG_MESSAGE, CONSERVATION_NAME); - Path conservationFolder = downloadFolder.resolve(CONSERVATION_SUBDIRECTORY); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); + Path conservationFolder = downloadFolder.resolve(CONSERVATION_DATA); Files.createDirectories(conservationFolder); - Files.createDirectories(conservationFolder.resolve(GERP_SUBDIRECTORY)); - Files.createDirectories(conservationFolder.resolve(PHASTCONS_SUBDIRECTORY)); - 
Files.createDirectories(conservationFolder.resolve(PHYLOP_SUBDIRECTORY)); + Files.createDirectories(conservationFolder.resolve(GERP_DATA)); + Files.createDirectories(conservationFolder.resolve(PHASTCONS_DATA)); + Files.createDirectories(conservationFolder.resolve(PHYLOP_DATA)); String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "M", }; @@ -93,14 +92,14 @@ public List downloadConservation() throws IOException, Interrupted List phastconsUrls = new ArrayList<>(chromosomes.length); List phyloPUrls = new ArrayList<>(chromosomes.length); // Downloading PhastCons and PhyloP - logger.info(DOWNLOADING_LOG_MESSAGE, (PHASTCONS_NAME + "/" + PHYLOP_NAME)); + logger.info(DOWNLOADING_LOG_MESSAGE, (getDataName(PHASTCONS_DATA) + "/" + getDataName(PHYLOP_DATA))); for (String chromosome : chromosomes) { // PhastCons String phastConsUrl = configuration.getDownload().getPhastCons().getHost() + configuration.getDownload().getPhastCons() .getFiles().get(PHASTCONS_FILE_ID).replaceAll(PUT_ASSEMBLY_HERE_MARK, assembly) .replace(PUT_CHROMOSOME_HERE_MARK, chromosome); filename = Paths.get(phastConsUrl).getFileName().toString(); - outputPath = conservationFolder.resolve(PHASTCONS_SUBDIRECTORY).resolve(filename); + outputPath = conservationFolder.resolve(PHASTCONS_DATA).resolve(filename); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, phastConsUrl, outputPath); downloadFiles.add(downloadFile(phastConsUrl, outputPath.toString())); phastconsUrls.add(phastConsUrl); @@ -110,30 +109,30 @@ public List downloadConservation() throws IOException, Interrupted .getFiles().get(PHYLOP_FILE_ID).replaceAll(PUT_ASSEMBLY_HERE_MARK, assembly) .replace(PUT_CHROMOSOME_HERE_MARK, chromosome); filename = Paths.get(phyloPUrl).getFileName().toString(); - outputPath = conservationFolder.resolve(PHYLOP_SUBDIRECTORY).resolve(filename); + outputPath = conservationFolder.resolve(PHYLOP_DATA).resolve(filename); 
logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, phyloPUrl, outputPath); downloadFiles.add(downloadFile(phyloPUrl, outputPath.toString())); phyloPUrls.add(phyloPUrl); } // Downloading Gerp - logger.info(DOWNLOADING_LOG_MESSAGE, GERP_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GERP_DATA)); String gerpUrl = configuration.getDownload().getGerp().getHost() + configuration.getDownload().getGerp().getFiles() .get(GERP_FILE_ID); filename = Paths.get(gerpUrl).getFileName().toString(); - outputPath = conservationFolder.resolve(GERP_SUBDIRECTORY).resolve(filename); + outputPath = conservationFolder.resolve(GERP_DATA).resolve(filename); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, gerpUrl, outputPath); downloadFiles.add(downloadFile(gerpUrl, outputPath.toString())); // Save data version - saveDataSource(PHASTCONS_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getPhastCons().getVersion(), - getTimeStamp(), phastconsUrls, conservationFolder.resolve(PHASTCONS_VERSION_FILENAME)); - saveDataSource(PHYLOP_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getPhylop().getVersion(), - getTimeStamp(), phyloPUrls, conservationFolder.resolve(PHYLOP_VERSION_FILENAME)); - saveDataSource(GERP_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getGerp().getVersion(), getTimeStamp(), - Collections.singletonList(gerpUrl), conservationFolder.resolve(GERP_VERSION_FILENAME)); + saveDataSource(PHASTCONS_DATA, configuration.getDownload().getPhastCons().getVersion(), getTimeStamp(), phastconsUrls, + conservationFolder.resolve(getDataVersionFilename(PHASTCONS_DATA))); + saveDataSource(PHYLOP_DATA, configuration.getDownload().getPhylop().getVersion(), getTimeStamp(), phyloPUrls, + conservationFolder.resolve(getDataVersionFilename(PHYLOP_DATA))); + saveDataSource(GERP_DATA, configuration.getDownload().getGerp().getVersion(), getTimeStamp(), + Collections.singletonList(gerpUrl), conservationFolder.resolve(getDataVersionFilename(GERP_DATA))); } - 
logger.info(DOWNLOADING_DONE_LOG_MESSAGE, CONSERVATION_NAME); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); } return downloadFiles; @@ -144,8 +143,8 @@ public List downloadRepeats() throws IOException, InterruptedExcep return Collections.emptyList(); } if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info(DOWNLOADING_LOG_MESSAGE, REPEATS_NAME); - Path repeatsFolder = downloadFolder.resolve(EtlCommons.REPEATS_SUBDIRECTORY); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(REPEATS_DATA)); + Path repeatsFolder = downloadFolder.resolve(REPEATS_DATA); Files.createDirectories(repeatsFolder); List downloadFiles = new ArrayList<>(); String pathParam; @@ -160,36 +159,33 @@ public List downloadRepeats() throws IOException, InterruptedExcep // Download tandem repeat finder String url = configuration.getDownload().getSimpleRepeats().getHost() + configuration.getDownload().getSimpleRepeats() .getFiles().get(SIMPLE_REPEATS_FILE_ID).replace(PUT_ASSEMBLY_HERE_MARK, pathParam); - saveDataSource(TRF_NAME, EtlCommons.REPEATS_DATA, configuration.getDownload().getSimpleRepeats().getVersion(), getTimeStamp(), - Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.TRF_VERSION_FILENAME)); - Path outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); + saveDataSource(TRF_DATA, configuration.getDownload().getSimpleRepeats().getVersion(), getTimeStamp(), + Collections.singletonList(url), repeatsFolder.resolve(getDataVersionFilename(TRF_DATA))); // Download genomic super duplications url = configuration.getDownload().getGenomicSuperDups().getHost() + configuration.getDownload().getGenomicSuperDups() .getFiles().get(GENOMIC_SUPER_DUPS_FILE_ID).replace(PUT_ASSEMBLY_HERE_MARK, pathParam); - saveDataSource(GSD_NAME, EtlCommons.REPEATS_DATA, 
configuration.getDownload().getGenomicSuperDups().getVersion(), - getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.GSD_VERSION_FILENAME)); - outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); + saveDataSource(GSD_DATA, configuration.getDownload().getGenomicSuperDups().getVersion(), getTimeStamp(), + Collections.singletonList(url), repeatsFolder.resolve(getDataVersionFilename(GSD_DATA))); // Download WindowMasker if (!pathParam.equalsIgnoreCase(HG19_NAME)) { url = configuration.getDownload().getWindowMasker().getHost() + configuration.getDownload().getWindowMasker().getFiles() .get(WINDOW_MASKER_FILE_ID).replace(PUT_ASSEMBLY_HERE_MARK, pathParam); - saveDataSource(WM_NAME, EtlCommons.REPEATS_DATA, configuration.getDownload().getWindowMasker().getVersion(), - getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.WM_VERSION_FILENAME)); - outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); + saveDataSource(WM_DATA, configuration.getDownload().getWindowMasker().getVersion(), getTimeStamp(), + Collections.singletonList(url), repeatsFolder.resolve(getDataVersionFilename(WM_DATA))); } - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, REPEATS_NAME); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(REPEATS_DATA)); return downloadFiles; } return Collections.emptyList(); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java index 4a91d84225..53ff518323 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java @@ -40,9 +40,10 @@ public OntologyDownloadManager(String species, String assembly, Path targetDirec } public List download() throws IOException, InterruptedException, CellBaseException { - Path oboFolder = downloadFolder.resolve(ONTOLOGY_SUBDIRECTORY); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(ONTOLOGY_DATA)); + + Path oboFolder = downloadFolder.resolve(ONTOLOGY_DATA); Files.createDirectories(oboFolder); - logger.info(DOWNLOADING_LOG_MESSAGE, ONTOLOGY_NAME); DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); @@ -50,33 +51,32 @@ public List download() throws IOException, InterruptedException, C // HPO downloadFile = downloadDataSource(configuration.getDownload().getHpoObo(), HPO_OBO_FILE_ID, oboFolder); String version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); - saveDataSource(HPO_OBO_NAME, ONTOLOGY_NAME, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - oboFolder.resolve(HPO_OBO_VERSION_FILENAME)); + saveDataSource(HPO_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(getDataVersionFilename(HPO_OBO_DATA))); downloadFiles.add(downloadFile); // GO downloadFile = downloadDataSource(configuration.getDownload().getGoObo(), GO_OBO_FILE_ID, oboFolder); version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); - saveDataSource(GO_OBO_NAME, ONTOLOGY_NAME, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - oboFolder.resolve(GO_OBO_VERSION_FILENAME)); + saveDataSource(GO_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(getDataVersionFilename(GO_OBO_DATA))); downloadFiles.add(downloadFile); // DOID downloadFile = downloadDataSource(configuration.getDownload().getDoidObo(), DOID_OBO_FILE_ID, oboFolder); version = 
getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); - saveDataSource(DOID_OBO_NAME, ONTOLOGY_NAME, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - oboFolder.resolve(DOID_OBO_VERSION_FILENAME)); + saveDataSource(DOID_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(getDataVersionFilename(DOID_OBO_DATA))); downloadFiles.add(downloadFile); // Mondo downloadFile = downloadDataSource(configuration.getDownload().getMondoObo(), MONDO_OBO_FILE_ID, oboFolder); version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); - saveDataSource(MONDO_OBO_NAME, ONTOLOGY_NAME, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - oboFolder.resolve(MONDO_OBO_VERSION_FILENAME)); + saveDataSource(MONDO_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(getDataVersionFilename(MONDO_OBO_DATA))); downloadFiles.add(downloadFile); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, ONTOLOGY_NAME); - + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(ONTOLOGY_DATA)); return downloadFiles; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java index 873387f94b..2eeac8415f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java @@ -38,9 +38,9 @@ public PharmGKBDownloadManager(String species, String assembly, Path targetDirec @Override public List download() throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, PHARMACOGENOMICS_NAME); + logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataCategory(PHARMGKB_DATA), getDataName(PHARMGKB_DATA)); - Path pharmgkbDownloadFolder = 
downloadFolder.resolve(PHARMACOGENOMICS_SUBDIRECTORY).resolve(PHARMGKB_SUBDIRECTORY); + Path pharmgkbDownloadFolder = downloadFolder.resolve(PHARMACOGENOMICS_DATA).resolve(PHARMGKB_DATA); Files.createDirectories(pharmgkbDownloadFolder); DownloadProperties.URLProperties pharmGKBProps = configuration.getDownload().getPharmGKB(); @@ -58,12 +58,11 @@ public List download() throws IOException, InterruptedException, C downloadFiles.add(downloadFile); } - // Save versions - saveDataSource(PHARMGKB_NAME, PHARMACOGENOMICS_NAME, pharmGKBProps.getVersion(), getTimeStamp(), urls, - pharmgkbDownloadFolder.resolve(PHARMGKB_VERSION_FILENAME)); - - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, PHARMACOGENOMICS_NAME); + // Save data source + saveDataSource(PHARMGKB_DATA, pharmGKBProps.getVersion(), getTimeStamp(), urls, + pharmgkbDownloadFolder.resolve(getDataVersionFilename(PHARMGKB_DATA))); + logger.info(CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE, getDataCategory(PHARMGKB_DATA), getDataName(PHARMGKB_DATA)); return downloadFiles; } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java index 5cb8a4c1f0..ba75a8e162 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java @@ -44,34 +44,30 @@ public ProteinDownloadManager(String species, String assembly, Path targetDirect * @throws CellBaseException if there is an error in the CelllBase configuration file */ public List download() throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, PROTEIN_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(PROTEIN_DATA)); if (!speciesHasInfoToDownload(speciesConfiguration, PROTEIN_DATA)) { - logger.info("{} not supported for the species {}", PROTEIN_NAME, 
speciesConfiguration.getScientificName()); + logger.info("{} not supported for the species {}", getDataName(PROTEIN_DATA), speciesConfiguration.getScientificName()); return Collections.emptyList(); } - Path proteinFolder = downloadFolder.resolve(PROTEIN_SUBDIRECTORY); + Path proteinFolder = downloadFolder.resolve(PROTEIN_DATA); Files.createDirectories(proteinFolder); DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); // Uniprot - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getUniprot(), UNIPROT_FILE_ID, UNIPROT_NAME, PROTEIN_DATA, - UNIPROT_VERSION_FILENAME, proteinFolder); + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getUniprot(), UNIPROT_FILE_ID, UNIPROT_DATA, proteinFolder); downloadFiles.add(downloadFile); // InterPro - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getInterpro(), INTERPRO_FILE_ID, INTERPRO_NAME, PROTEIN_DATA, - INTERPRO_VERSION_FILENAME, proteinFolder); + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getInterpro(), INTERPRO_FILE_ID, INTERPRO_DATA, proteinFolder); downloadFiles.add(downloadFile); // Intact - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getIntact(), INTACT_FILE_ID, INTACT_NAME, PROTEIN_DATA, - INTACT_VERSION_FILENAME, proteinFolder); + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getIntact(), INTACT_FILE_ID, INTACT_DATA, proteinFolder); downloadFiles.add(downloadFile); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, PROTEIN_NAME); - + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(PROTEIN_DATA)); return downloadFiles; } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java index 56d15bf844..0c87775f5c 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java @@ -40,12 +40,13 @@ public RegulationDownloadManager(String species, String assembly, Path outdir, C @Override public List download() throws IOException, InterruptedException, CellBaseException { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(REGULATION_DATA)); if (!speciesHasInfoToDownload(speciesConfiguration, REGULATION_DATA)) { + logger.info("{} not supported for the species {}", getDataName(REGULATION_DATA), speciesConfiguration.getScientificName()); return Collections.emptyList(); } - regulationFolder = downloadFolder.resolve(REGULATION_SUBDIRECTORY); + regulationFolder = downloadFolder.resolve(REGULATION_DATA); Files.createDirectories(regulationFolder); - logger.info("Downloading {} files at {} ...", REGULATION_DATA, regulationFolder); List downloadFiles = new ArrayList<>(); @@ -53,6 +54,7 @@ public List download() throws IOException, InterruptedException, C downloadFiles.add(downloadMiRTarBase()); downloadFiles.add(downloadMirna()); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(REGULATION_DATA)); return downloadFiles; } @@ -62,19 +64,12 @@ public List download() throws IOException, InterruptedException, C * @throws InterruptedException Any issue downloading files */ private List downloadRegulatoryaAndMotifFeatures() throws IOException, InterruptedException, CellBaseException { -// String baseUrl; -// if (configuration.getSpecies().getVertebrates().contains(speciesConfiguration)) { -// baseUrl = ensemblHostUrl + ensemblRelease + "/"; -// } else { -// baseUrl = ensemblHostUrl + ensemblRelease + "/" + getPhylo(speciesConfiguration) + "/"; -// } - DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); // Regulatory build downloadFile = downloadAndSaveEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_REGULATORY_BUILD_FILE_ID, - REGULATORY_BUILD_NAME, REGULATION_DATA, null, REGULATORY_BUILD_VERSION_FILENAME, regulationFolder); + 
REGULATORY_BUILD_DATA, regulationFolder); downloadFiles.add(downloadFile); // Motifs features @@ -89,21 +84,29 @@ private List downloadRegulatoryaAndMotifFeatures() throws IOExcept downloadFiles.add(downloadFile); urls.add(downloadFile.getUrl()); // Save data source (name, category, version,...) - saveDataSource(MOTIF_FEATURES_NAME, REGULATION_DATA, "(" + ENSEMBL_NAME + " " + ensemblVersion + ")", getTimeStamp(), urls, - regulationFolder.resolve(MOTIF_FEATURES_VERSION_FILENAME)); + saveDataSource(MOTIF_FEATURES_DATA, "(" + getDataName(ENSEMBL_DATA) + " " + ensemblVersion + ")", getTimeStamp(), urls, + regulationFolder.resolve(getDataVersionFilename(MOTIF_FEATURES_DATA))); return downloadFiles; } private DownloadFile downloadMirna() throws IOException, InterruptedException, CellBaseException { - logger.info("Downloading {} ...", MIRBASE_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getMirbase(), MIRBASE_FILE_ID, MIRBASE_NAME, REGULATION_DATA, - MIRBASE_VERSION_FILENAME, regulationFolder); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(MIRBASE_DATA)); + + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getMirbase(), MIRBASE_FILE_ID, MIRBASE_DATA, + regulationFolder); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(MIRBASE_DATA)); + return downloadFile; } private DownloadFile downloadMiRTarBase() throws IOException, InterruptedException, CellBaseException { - logger.info("Downloading {} ...", MIRTARBASE_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getMiRTarBase(), MIRTARBASE_FILE_ID, MIRTARBASE_NAME, REGULATION_DATA, - MIRTARBASE_VERSION_FILENAME, regulationFolder); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(MIRTARBASE_DATA)); + + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getMiRTarBase(), MIRTARBASE_FILE_ID, + MIRTARBASE_DATA, regulationFolder); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(MIRTARBASE_DATA)); + 
return downloadFile; } } From 1586a77d87a96716bf6d6a2db2cd4713104f8474 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 29 Apr 2024 12:53:57 +0200 Subject: [PATCH 062/148] app: update load command executor according to the EtlCommons changes, #TASK-6142, #TASK-5564 --- .../app/cli/admin/executors/LoadCommandExecutor.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index c750beb6aa..166c4e7a6f 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -44,7 +44,7 @@ import java.util.List; import java.util.concurrent.ExecutionException; -import static org.opencb.cellbase.lib.EtlCommons.PUBMED_DATA; +import static org.opencb.cellbase.lib.EtlCommons.*; /** * Created by imedina on 03/02/15. 
@@ -486,9 +486,9 @@ private void loadRepeats() { // Update release (collection and sources) List sources = new ArrayList<>(Arrays.asList( - input.resolve(EtlCommons.TRF_VERSION_FILENAME), - input.resolve(EtlCommons.GSD_VERSION_FILENAME), - input.resolve(EtlCommons.WM_VERSION_FILENAME) + input.resolve(getDataVersionFilename(TRF_DATA)), + input.resolve(getDataVersionFilename(GSD_DATA)), + input.resolve(getDataVersionFilename(WM_DATA)) )); dataReleaseManager.update(dataRelease, "repeats", EtlCommons.REPEATS_DATA, sources); } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException @@ -587,7 +587,7 @@ private void loadPharmacogenomica() throws IOException, CellBaseException { createIndex(EtlCommons.PHARMACOGENOMICS_DATA); // Update release (collection and sources) - List sources = Collections.singletonList(pharmaPath.resolve(EtlCommons.PHARMGKB_VERSION_FILENAME)); + List sources = Collections.singletonList(pharmaPath.resolve(getDataVersionFilename(PHARMGKB_DATA))); dataReleaseManager.update(dataRelease, EtlCommons.PHARMACOGENOMICS_DATA, EtlCommons.PHARMACOGENOMICS_DATA, sources); } From c7c398ab101ef654295acd01778cf3c33b135806 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 29 Apr 2024 12:54:46 +0200 Subject: [PATCH 063/148] lib: update CellBase builders according to the EtlCommons changes, #TASK-5776, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 59 +++++++++--------- .../org/opencb/cellbase/lib/EtlCommons.java | 1 + .../lib/builders/ConservationBuilder.java | 41 +++++++------ .../lib/builders/OntologyBuilder.java | 29 ++++----- .../lib/builders/PharmGKBBuilder.java | 14 +++-- .../cellbase/lib/builders/ProteinBuilder.java | 41 ++++++------- .../builders/RegulatoryFeatureBuilder.java | 21 ++++--- .../cellbase/lib/builders/RepeatsBuilder.java | 61 +++++++++++-------- 8 files changed, 141 insertions(+), 126 deletions(-) diff --git 
a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index f1fdcbbb19..0cf6b17899 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -193,24 +193,24 @@ public void execute() throws CellBaseException { private CellBaseBuilder buildRepeats() throws CellBaseException { // Sanity check - Path repeatsDownloadPath = downloadFolder.resolve(REPEATS_SUBDIRECTORY); - List versionPaths = Arrays.asList(repeatsDownloadPath.resolve(TRF_VERSION_FILENAME), - repeatsDownloadPath.resolve(GSD_VERSION_FILENAME), - repeatsDownloadPath.resolve(WM_VERSION_FILENAME)); - copyVersionFiles(versionPaths, buildFolder.resolve(REPEATS_SUBDIRECTORY)); + Path repeatsDownloadPath = downloadFolder.resolve(REPEATS_DATA); + List versionPaths = Arrays.asList(repeatsDownloadPath.resolve(getDataVersionFilename(TRF_DATA)), + repeatsDownloadPath.resolve(getDataVersionFilename(GSD_DATA)), + repeatsDownloadPath.resolve(getDataVersionFilename(WM_DATA))); + copyVersionFiles(versionPaths, buildFolder.resolve(REPEATS_DATA)); // Create serializer and return the repeats builder - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(REPEATS_SUBDIRECTORY), REPEATS_DATA); + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(REPEATS_DATA), REPEATS_BASENAME); return new RepeatsBuilder(repeatsDownloadPath, serializer, configuration); } private CellBaseBuilder buildObo() throws CellBaseException { - Path oboDownloadPath = downloadFolder.resolve(ONTOLOGY_SUBDIRECTORY); - Path oboBuildPath = buildFolder.resolve(ONTOLOGY_SUBDIRECTORY); - List versionPaths = Arrays.asList(oboDownloadPath.resolve(HPO_OBO_VERSION_FILENAME), - 
oboDownloadPath.resolve(GO_OBO_VERSION_FILENAME), - oboDownloadPath.resolve(DOID_OBO_VERSION_FILENAME), - oboDownloadPath.resolve(MONDO_OBO_VERSION_FILENAME)); + Path oboDownloadPath = downloadFolder.resolve(ONTOLOGY_DATA); + Path oboBuildPath = buildFolder.resolve(ONTOLOGY_DATA); + List versionPaths = Arrays.asList(oboDownloadPath.resolve(getDataVersionFilename(HPO_OBO_DATA)), + oboDownloadPath.resolve(getDataVersionFilename(GO_OBO_DATA)), + oboDownloadPath.resolve(getDataVersionFilename(DOID_OBO_DATA)), + oboDownloadPath.resolve(getDataVersionFilename(MONDO_OBO_DATA))); copyVersionFiles(versionPaths, oboBuildPath); // Create serializer and return the ontology builder @@ -234,14 +234,14 @@ private void copyVersionFiles(List pathList) { private CellBaseBuilder buildGenomeSequence() throws CellBaseException { // Sanity check - Path genomeVersionPath = downloadFolder.resolve(GENOME_SUBDIRECTORY).resolve(GENOME_VERSION_FILENAME); - copyVersionFiles(Collections.singletonList(genomeVersionPath), buildFolder.resolve(GENOME_SUBDIRECTORY)); + Path genomeVersionPath = downloadFolder.resolve(GENOME_DATA).resolve(getDataVersionFilename(GENOME_DATA)); + copyVersionFiles(Collections.singletonList(genomeVersionPath), buildFolder.resolve(GENOME_DATA)); // Get FASTA path Path fastaPath = getFastaReferenceGenome(); // Create serializer and return the genome builder - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(GENOME_SUBDIRECTORY), GENOME_DATA); + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(GENOME_DATA), GENOME_DATA); return new GenomeSequenceFastaBuilder(fastaPath, serializer); } @@ -290,8 +290,8 @@ private CellBaseBuilder buildRegulation() throws CellBaseException { // Sanity check Path regulationDownloadPath = downloadFolder.resolve(REGULATION_DATA); Path regulationBuildPath = buildFolder.resolve(REGULATION_DATA); - 
copyVersionFiles(Arrays.asList(regulationDownloadPath.resolve(REGULATORY_BUILD_VERSION_FILENAME), - regulationDownloadPath.resolve(MOTIF_FEATURES_VERSION_FILENAME)), regulationBuildPath); + copyVersionFiles(Arrays.asList(regulationDownloadPath.resolve(getDataVersionFilename(REGULATORY_BUILD_DATA)), + regulationDownloadPath.resolve(getDataVersionFilename(MOTIF_FEATURES_DATA))), regulationBuildPath); // Create the file serializer and the regulatory feature builder CellBaseSerializer serializer = new CellBaseJsonFileSerializer(regulationBuildPath, REGULATORY_REGION_BASENAME); @@ -300,10 +300,10 @@ private CellBaseBuilder buildRegulation() throws CellBaseException { private CellBaseBuilder buildProtein() throws CellBaseException { // Sanity check - Path proteinDownloadPath = downloadFolder.resolve(PROTEIN_SUBDIRECTORY); - Path proteinBuildPath = buildFolder.resolve(PROTEIN_SUBDIRECTORY); - copyVersionFiles(Arrays.asList(proteinDownloadPath.resolve(UNIPROT_VERSION_FILENAME), - proteinDownloadPath.resolve(INTERPRO_VERSION_FILENAME)), proteinBuildPath); + Path proteinDownloadPath = downloadFolder.resolve(PROTEIN_DATA); + Path proteinBuildPath = buildFolder.resolve(PROTEIN_DATA); + copyVersionFiles(Arrays.asList(proteinDownloadPath.resolve(getDataVersionFilename(UNIPROT_DATA)), + proteinDownloadPath.resolve(getDataVersionFilename(INTERPRO_DATA))), proteinBuildPath); // Create the file serializer and the protein builder CellBaseSerializer serializer = new CellBaseJsonFileSerializer(proteinBuildPath, PROTEIN_DATA); @@ -312,13 +312,14 @@ private CellBaseBuilder buildProtein() throws CellBaseException { private CellBaseBuilder buildConservation() throws CellBaseException { // Sanity check - Path conservationDownloadPath = downloadFolder.resolve(CONSERVATION_SUBDIRECTORY); - copyVersionFiles(Arrays.asList(conservationDownloadPath.resolve(GERP_VERSION_FILENAME), - conservationDownloadPath.resolve(PHASTCONS_VERSION_FILENAME), 
conservationDownloadPath.resolve(PHYLOP_VERSION_FILENAME)), - buildFolder.resolve(CONSERVATION_SUBDIRECTORY)); + Path conservationDownloadPath = downloadFolder.resolve(CONSERVATION_DATA); + Path conservationBuildPath = buildFolder.resolve(CONSERVATION_DATA); + copyVersionFiles(Arrays.asList(conservationDownloadPath.resolve(getDataVersionFilename(GERP_DATA)), + conservationDownloadPath.resolve(getDataVersionFilename(PHASTCONS_DATA)), + conservationDownloadPath.resolve(getDataVersionFilename(PHYLOP_DATA))), conservationBuildPath); int conservationChunkSize = MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(CONSERVATION_SUBDIRECTORY)); + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(conservationBuildPath); return new ConservationBuilder(conservationDownloadPath, conservationChunkSize, serializer); } @@ -360,7 +361,7 @@ private Path getFastaReferenceGenome() throws CellBaseException { String ensemblUrl = getEnsemblUrl(configuration.getDownload().getEnsembl(), ensemblRelease, ENSEMBL_PRIMARY_FA_FILE_ID, getSpeciesShortname(speciesConfiguration), assembly.getName(), null); String fastaFilename = Paths.get(ensemblUrl).getFileName().toString(); - Path fastaPath = downloadFolder.resolve(GENOME_SUBDIRECTORY).resolve(fastaFilename); + Path fastaPath = downloadFolder.resolve(GENOME_DATA).resolve(fastaFilename); if (fastaPath.toFile().exists()) { // Gunzip logger.info("Gunzip file: {}", fastaPath); @@ -374,7 +375,7 @@ private Path getFastaReferenceGenome() throws CellBaseException { throw new CellBaseException("Error executing gunzip in FASTA file " + fastaPath, e); } } - fastaPath = downloadFolder.resolve(GENOME_SUBDIRECTORY).resolve(fastaFilename.replace(".gz", "")); + fastaPath = downloadFolder.resolve(GENOME_DATA).resolve(fastaFilename.replace(".gz", "")); if (!fastaPath.toFile().exists()) { throw new CellBaseException("FASTA file " + fastaPath + " does not 
exist after executing gunzip"); } @@ -413,7 +414,7 @@ private CellBaseBuilder buildPharmacogenomics() throws CellBaseException { // Sanity check Path pharmGkbDownloadPath = downloadFolder.resolve(PHARMACOGENOMICS_DATA).resolve(PHARMGKB_DATA); Path pharmGkbBuildPath = buildFolder.resolve(PHARMACOGENOMICS_DATA).resolve(PHARMGKB_DATA); - copyVersionFiles(Arrays.asList(pharmGkbDownloadPath.resolve(PHARMGKB_VERSION_FILENAME)), pharmGkbBuildPath); + copyVersionFiles(Arrays.asList(pharmGkbDownloadPath.resolve(getDataVersionFilename(PHARMGKB_DATA))), pharmGkbBuildPath); // Create the file serializer and the PharmGKB feature builder CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(pharmGkbBuildPath); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index f2cc152005..a836288d6f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -186,6 +186,7 @@ public final class EtlCommons { // Repeats public static final String REPEATS_DATA = "repeats"; + public static final String REPEATS_BASENAME = "repeats"; /** * @deprecated (when refactoring downloaders, builders and loaders) */ diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java index 79099a4d93..d43c38cb7a 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java @@ -18,6 +18,7 @@ import org.opencb.biodata.models.core.GenomicScoreRegion; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; import 
org.opencb.cellbase.lib.EtlCommons; import org.opencb.cellbase.lib.MongoDBCollectionConfiguration; @@ -55,7 +56,7 @@ public ConservationBuilder(Path conservedRegionPath, int chunkSize, CellBaseFile @Override public void parse() throws IOException, CellBaseException { - logger.info(BUILDING_LOG_MESSAGE, CONSERVATION_NAME); + logger.info(BUILDING_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); if (conservedRegionPath == null || !Files.exists(conservedRegionPath) || !Files.isDirectory(conservedRegionPath)) { throw new IOException("Conservation directory " + conservedRegionPath + " does not exist or it is not a directory or it cannot" @@ -63,25 +64,25 @@ public void parse() throws IOException, CellBaseException { } // Check GERP folder and files - Path gerpPath = conservedRegionPath.resolve(GERP_SUBDIRECTORY); - List gerpFiles = checkFiles(dataSourceReader.readValue(conservedRegionPath.resolve(GERP_VERSION_FILENAME).toFile()), gerpPath, - GERP_NAME); + Path gerpPath = conservedRegionPath.resolve(GERP_DATA); + DataSource dataSource = dataSourceReader.readValue(conservedRegionPath.resolve(getDataVersionFilename(GERP_DATA)).toFile()); + List gerpFiles = checkFiles(dataSource, gerpPath, getDataName(GERP_DATA)); // Check PhastCons folder and files - Path phastConsPath = conservedRegionPath.resolve(PHASTCONS_SUBDIRECTORY); - List phastConsFiles = checkFiles(dataSourceReader.readValue(conservedRegionPath.resolve(PHASTCONS_VERSION_FILENAME).toFile()), - phastConsPath, PHASTCONS_NAME); + Path phastConsPath = conservedRegionPath.resolve(PHASTCONS_DATA); + dataSource = dataSourceReader.readValue(conservedRegionPath.resolve(getDataVersionFilename(PHASTCONS_DATA)).toFile()); + List phastConsFiles = checkFiles(dataSource, phastConsPath, getDataName(PHASTCONS_DATA)); // Check PhyloP folder and files - Path phylopPath = conservedRegionPath.resolve(PHYLOP_SUBDIRECTORY); - List phylopFiles = 
checkFiles(dataSourceReader.readValue(conservedRegionPath.resolve(PHYLOP_VERSION_FILENAME).toFile()), - phylopPath, PHYLOP_NAME); + Path phylopPath = conservedRegionPath.resolve(PHYLOP_DATA); + dataSource = dataSourceReader.readValue(conservedRegionPath.resolve(getDataVersionFilename(PHYLOP_DATA)).toFile()); + List phylopFiles = checkFiles(dataSource, phylopPath, getDataName(PHYLOP_DATA)); // GERP is downloaded from Ensembl as a bigwig file. The library we have doesn't seem to parse // this file correctly, so we transform the file into a bedGraph format which is human-readable. if (gerpFiles.size() != 1) { - throw new CellBaseException("Only one " + GERP_NAME + " file is expected, but currently there are " + gerpFiles.size() - + " files"); + throw new CellBaseException("Only one " + getDataName(GERP_DATA) + " file is expected, but currently there are " + + gerpFiles.size() + " files"); } File bigwigFile = gerpFiles.get(0); File bedgraphFile = Paths.get(gerpFiles.get(0).getAbsolutePath() + ".bedgraph").toFile(); @@ -91,8 +92,8 @@ public void parse() throws IOException, CellBaseException { if (isExecutableAvailable(exec)) { EtlCommons.runCommandLineProcess(null, exec, Arrays.asList(bigwigFile.toString(), bedgraphFile.toString()), null); } else { - throw new CellBaseException(exec + " not found in your system, install it to build " + GERP_NAME + ". It is available" - + " at http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/"); + throw new CellBaseException(exec + " not found in your system, install it to build " + getDataName(GERP_DATA) + + ". 
It is available at http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/"); } } catch (IOException e) { throw new CellBaseException("Error executing " + exec + " in BIGWIG file " + bigwigFile, e); @@ -131,13 +132,13 @@ public void parse() throws IOException, CellBaseException { logger.debug("Chromosomes found '{}'", chromosomes); for (String chr : chromosomes) { logger.debug("Processing chromosome '{}', file '{}'", chr, files.get(chr + PHASTCONS_DATA)); - processWigFixFile(files.get(chr + PHASTCONS_DATA), PHASTCONS_NAME); + processWigFixFile(files.get(chr + PHASTCONS_DATA), PHASTCONS_DATA); logger.debug("Processing chromosome '{}', file '{}'", chr, files.get(chr + PHYLOP_DATA)); - processWigFixFile(files.get(chr + PHYLOP_DATA), PHYLOP_NAME); + processWigFixFile(files.get(chr + PHYLOP_DATA), PHYLOP_DATA); } - logger.info(BUILDING_DONE_LOG_MESSAGE, CONSERVATION_NAME); + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); } private void gerpParser(Path gerpProcessFilePath) throws IOException, CellBaseException { @@ -156,8 +157,8 @@ private void gerpParser(Path gerpProcessFilePath) throws IOException, CellBaseEx // Checking line if (fields.length != 4) { - throw new CellBaseException("Invalid " + GERP_NAME + " line (expecting 4 columns): " + fields.length + " items: " - + line); + throw new CellBaseException("Invalid " + getDataName(GERP_DATA) + " line (expecting 4 columns): " + fields.length + + " items: " + line); } chromosome = fields[0]; @@ -263,7 +264,7 @@ private void storeScores(int startOfBatch, String chromosome, List conser } GenomicScoreRegion conservationScoreRegion = new GenomicScoreRegion<>(chromosome, startOfBatch, - startOfBatch + conservationScores.size() - 1, GERP_NAME, conservationScores); + startOfBatch + conservationScores.size() - 1, GERP_DATA, conservationScores); fileSerializer.serialize(conservationScoreRegion, getOutputFileName(chromosome)); // Reset diff --git 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java index 679e0d30f8..b14d20b54c 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java @@ -42,34 +42,34 @@ public OntologyBuilder(Path oboDownloadPath, CellBaseSerializer serializer) { @Override public void parse() throws Exception { - logger.info(BUILDING_LOG_MESSAGE, ONTOLOGY_NAME); + logger.info(BUILDING_LOG_MESSAGE, getDataName(ONTOLOGY_DATA)); // Sanity check - checkDirectory(oboDownloadPath, REGULATION_NAME); + checkDirectory(oboDownloadPath, getDataName(REGULATION_DATA)); // Check ontology files - List hpoFiles = checkOboFiles(oboDownloadPath.resolve(HPO_OBO_VERSION_FILENAME), HPO_OBO_NAME); - List goFiles = checkOboFiles(oboDownloadPath.resolve(GO_OBO_VERSION_FILENAME), GO_OBO_NAME); - List doidFiles = checkOboFiles(oboDownloadPath.resolve(DOID_OBO_VERSION_FILENAME), DOID_OBO_NAME); - List mondoFiles = checkOboFiles(oboDownloadPath.resolve(MONDO_OBO_VERSION_FILENAME), MONDO_OBO_NAME); + List hpoFiles = checkOboFiles(oboDownloadPath.resolve(getDataVersionFilename(HPO_OBO_DATA)), getDataName(HPO_OBO_DATA)); + List goFiles = checkOboFiles(oboDownloadPath.resolve(getDataVersionFilename(GO_OBO_DATA)), getDataName(GO_OBO_DATA)); + List doidFiles = checkOboFiles(oboDownloadPath.resolve(getDataVersionFilename(DOID_OBO_DATA)), getDataName(DOID_OBO_DATA)); + List mondoFiles = checkOboFiles(oboDownloadPath.resolve(getDataVersionFilename(MONDO_OBO_DATA)), getDataName(MONDO_OBO_DATA)); // Parse OBO files and build - parseOboFile(hpoFiles.get(0), HPO_OBO_NAME); - parseOboFile(goFiles.get(0), GO_OBO_NAME); - parseOboFile(doidFiles.get(0), DOID_OBO_NAME); - parseOboFile(mondoFiles.get(0), MONDO_OBO_NAME); + parseOboFile(hpoFiles.get(0), HPO_OBO_DATA); + parseOboFile(goFiles.get(0), 
GO_OBO_DATA); + parseOboFile(doidFiles.get(0), DOID_OBO_DATA); + parseOboFile(mondoFiles.get(0), MONDO_OBO_DATA); // Close serializer serializer.close(); - logger.info(BUILDING_DONE_LOG_MESSAGE, ONTOLOGY_NAME); + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(ONTOLOGY_DATA)); } - private void parseOboFile(File oboFile, String name) throws IOException { + private void parseOboFile(File oboFile, String data) throws IOException { logger.info(PARSING_LOG_MESSAGE, oboFile); try (BufferedReader bufferedReader = FileUtils.newBufferedReader(oboFile.toPath())) { OboParser parser = new OboParser(); - List terms = parser.parseOBO(bufferedReader, name); + List terms = parser.parseOBO(bufferedReader, data); for (OntologyTerm term : terms) { serializer.serialize(term); } @@ -78,7 +78,8 @@ private void parseOboFile(File oboFile, String name) throws IOException { } private List checkOboFiles(Path versionFilePath, String name) throws IOException, CellBaseException { - List files = checkFiles(dataSourceReader.readValue(versionFilePath.toFile()), oboDownloadPath, ONTOLOGY_NAME + "/" + name); + List files = checkFiles(dataSourceReader.readValue(versionFilePath.toFile()), oboDownloadPath, getDataName(ONTOLOGY_DATA) + + "/" + name); if (files.size() != 1) { throw new CellBaseException("One " + name + " file is expected, but currently there are " + files.size() + " files"); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java index 0e6017fc01..1a0ba2e7d3 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java @@ -24,6 +24,7 @@ import org.opencb.biodata.models.pharma.*; import org.opencb.biodata.models.pharma.guideline.BasicObject; import org.opencb.cellbase.core.exception.CellBaseException; +import 
org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; import org.opencb.cellbase.lib.EtlCommons; import org.opencb.commons.utils.FileUtils; @@ -97,14 +98,15 @@ public PharmGKBBuilder(Path parmGkbDownloadPath, CellBaseFileSerializer serializ @Override public void parse() throws Exception { - logger.info(BUILDING_LOG_MESSAGE, PHARMGKB_NAME); + logger.info(BUILDING_LOG_MESSAGE, getDataName(PHARMGKB_DATA)); // Sanity check - checkDirectory(pharmGkbDownloadPath, PHARMGKB_NAME); + checkDirectory(pharmGkbDownloadPath, getDataName(PHARMGKB_DATA)); // Check PharmGKB files - List pharmGkbFiles = checkFiles(dataSourceReader.readValue(pharmGkbDownloadPath.resolve(PHARMGKB_VERSION_FILENAME).toFile()), - pharmGkbDownloadPath, PHARMACOGENOMICS_NAME + "/" + PHARMGKB_NAME); + DataSource dataSource = dataSourceReader.readValue(pharmGkbDownloadPath.resolve(getDataVersionFilename(PHARMGKB_DATA)).toFile()); + List pharmGkbFiles = checkFiles(dataSource, pharmGkbDownloadPath, getDataCategory(PHARMGKB_DATA) + "/" + + getDataName(PHARMGKB_DATA)); // Unzip downloaded file unzipDownloadedFiles(pharmGkbFiles); @@ -129,7 +131,7 @@ public void parse() throws Exception { } serializer.close(); - logger.info(BUILDING_DONE_LOG_MESSAGE, PHARMGKB_NAME); + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(PHARMGKB_DATA)); } private Map parseChemicalFile() throws IOException { @@ -152,7 +154,7 @@ private Map parseChemicalFile() throws IOException { // Label Has Dosing Info Has Rx Annotation RxNorm Identifiers ATC Identifiers PubChem Compound Identifiers PharmaChemical pharmaChemical = new PharmaChemical() .setId(fields[0]) - .setSource(PHARMGKB_NAME) + .setSource(PHARMGKB_DATA) .setName(fields[1]) .setSmiles(fields[7]) .setInChI(fields[8]); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java index eb4c04a909..d8246241e4 
100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java @@ -22,6 +22,7 @@ import org.opencb.biodata.formats.protein.uniprot.UniProtParser; import org.opencb.biodata.formats.protein.uniprot.v202003jaxb.*; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.opencb.commons.utils.FileUtils; import org.rocksdb.Options; @@ -59,38 +60,30 @@ public ProteinBuilder(Path proteinPath, String species, CellBaseSerializer seria @Override public void parse() throws CellBaseException, IOException { - logger.info(BUILDING_LOG_MESSAGE, PROTEIN_NAME); + logger.info(BUILDING_LOG_MESSAGE, getDataName(PROTEIN_DATA)); // Sanity check - if (proteinPath == null) { - throw new CellBaseException(PROTEIN_NAME + " directory is missing (null)"); - } - if (!Files.exists(proteinPath)) { - throw new CellBaseException(PROTEIN_NAME + " directory " + proteinPath + " does not exist"); - } - if (!Files.isDirectory(proteinPath)) { - throw new CellBaseException(PROTEIN_NAME + " directory " + proteinPath + " is not a directory"); - } + checkDirectory(proteinPath, getDataName(PROTEIN_DATA)); // Check UniProt file - List uniProtFiles = checkFiles(dataSourceReader.readValue(proteinPath.resolve(UNIPROT_VERSION_FILENAME).toFile()), - proteinPath, PROTEIN_NAME + "/" + UNIPROT_NAME); + DataSource dataSource = dataSourceReader.readValue(proteinPath.resolve(getDataVersionFilename(UNIPROT_DATA)).toFile()); + List uniProtFiles = checkFiles(dataSource, proteinPath, getDataCategory(UNIPROT_DATA) + "/" + getDataName(UNIPROT_DATA)); if (uniProtFiles.size() != 1) { - throw new CellBaseException("Only one " + UNIPROT_NAME + " file is expected, but currently there are " + uniProtFiles.size() - + " files"); + throw new CellBaseException("Only one " + 
getDataName(UNIPROT_DATA) + " file is expected, but currently there are " + + uniProtFiles.size() + " files"); } // Check InterPro file - List interProFiles = checkFiles(dataSourceReader.readValue(proteinPath.resolve(INTERPRO_VERSION_FILENAME).toFile()), - proteinPath, PROTEIN_NAME + "/" + INTERPRO_NAME); + dataSource = dataSourceReader.readValue(proteinPath.resolve(getDataVersionFilename(INTERPRO_DATA)).toFile()); + List interProFiles = checkFiles(dataSource, proteinPath, getDataCategory(INTERPRO_DATA) + "/" + getDataName(INTERPRO_DATA)); if (interProFiles.size() != 1) { - throw new CellBaseException("Only one " + INTERPRO_NAME + " file is expected, but currently there are " + uniProtFiles.size() - + " files"); + throw new CellBaseException("Only one " + getDataName(INTERPRO_DATA) + " file is expected, but currently there are " + + interProFiles.size() + " files"); } // Prepare UniProt data by splitting data in chunks Path uniProtChunksPath = serializer.getOutdir().resolve(UNIPROT_CHUNKS_SUBDIRECTORY); - logger.info("Split {} file {} into chunks at {}", UNIPROT_NAME, uniProtFiles.get(0).getName(), uniProtChunksPath); + logger.info("Split {} file {} into chunks at {}", getDataName(UNIPROT_DATA), uniProtFiles.get(0).getName(), uniProtChunksPath); Files.createDirectories(uniProtChunksPath); splitUniprot(proteinPath.resolve(uniProtFiles.get(0).getName()), uniProtChunksPath); @@ -182,13 +175,13 @@ public void parse() throws CellBaseException, IOException { } if (++numInterProLinesProcessed % 10000000 == 0) { - logger.debug("{} {} lines processed. {} unique proteins processed", numInterProLinesProcessed, INTERPRO_NAME, - numUniqueProteinsProcessed); + logger.debug("{} {} lines processed. 
{} unique proteins processed", numInterProLinesProcessed, + getDataName(INTERPRO_DATA), numUniqueProteinsProcessed); } } logger.info(PARSING_DONE_LOG_MESSAGE, interProFiles.get(0)); } catch (IOException e) { - throw new CellBaseException("Error parsing " + INTERPRO_NAME + " file: " + interProFiles.get(0), e); + throw new CellBaseException("Error parsing " + getDataName(INTERPRO_DATA) + " file: " + interProFiles.get(0), e); } // Serialize and save results @@ -200,10 +193,10 @@ public void parse() throws CellBaseException, IOException { rocksDb.close(); } catch (JAXBException | RocksDBException | IOException e) { - throw new CellBaseException("Error parsing " + PROTEIN_NAME + " files", e); + throw new CellBaseException("Error parsing " + getDataName(PROTEIN_DATA) + " files", e); } - logger.info(BUILDING_DONE_LOG_MESSAGE, PROTEIN_NAME); + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(PROTEIN_DATA)); } private RocksDB getDBConnection(Path uniProtChunksPath) throws CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java index c8067661dc..83eccb9885 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java @@ -24,6 +24,7 @@ import org.opencb.biodata.models.core.RegulatoryFeature; import org.opencb.biodata.models.core.RegulatoryPfm; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; import org.opencb.cellbase.core.serializer.CellBaseSerializer; @@ -54,24 +55,26 @@ public RegulatoryFeatureBuilder(Path regulationPath, CellBaseSerializer serializ @Override public void parse() throws Exception { - logger.info(BUILDING_LOG_MESSAGE, 
REGULATION_NAME); + logger.info(BUILDING_LOG_MESSAGE, getDataName(REGULATION_DATA)); // Sanity check - checkDirectory(regulationPath, REGULATION_NAME); + checkDirectory(regulationPath, getDataName(REGULATION_DATA)); // Check build regulatory files - List regulatoryFiles = checkFiles(dataSourceReader.readValue(regulationPath.resolve(REGULATORY_BUILD_VERSION_FILENAME) - .toFile()), regulationPath, REGULATION_NAME + "/" + REGULATORY_BUILD_NAME); + DataSource dataSource = dataSourceReader.readValue(regulationPath.resolve(getDataVersionFilename(REGULATORY_BUILD_DATA)).toFile()); + List regulatoryFiles = checkFiles(dataSource, regulationPath, getDataCategory(REGULATORY_BUILD_DATA) + "/" + + getDataName(REGULATORY_BUILD_DATA)); if (regulatoryFiles.size() != 1) { - throw new CellBaseException("One " + REGULATORY_BUILD_NAME + " file is expected, but currently there are " + throw new CellBaseException("One " + getDataName(REGULATORY_BUILD_DATA) + " file is expected, but currently there are " + regulatoryFiles.size() + " files"); } // Check motif features files - List motifFeaturesFiles = checkFiles(dataSourceReader.readValue(regulationPath.resolve(MOTIF_FEATURES_VERSION_FILENAME) - .toFile()), regulationPath, REGULATION_NAME + "/" + MOTIF_FEATURES_NAME); + dataSource = dataSourceReader.readValue(regulationPath.resolve(getDataVersionFilename(MOTIF_FEATURES_DATA)).toFile()); + List motifFeaturesFiles = checkFiles(dataSource, regulationPath, getDataCategory(MOTIF_FEATURES_DATA) + "/" + + getDataName(MOTIF_FEATURES_DATA)); if (motifFeaturesFiles.size() != 2) { - throw new CellBaseException("Two " + MOTIF_FEATURES_NAME + " files are expected, but currently there are " + throw new CellBaseException("Two " + getDataName(MOTIF_FEATURES_DATA) + " files are expected, but currently there are " + motifFeaturesFiles.size() + " files"); } @@ -82,7 +85,7 @@ public void parse() throws Exception { // Parse regulatory build features parseGffFile(regulatoryFiles.get(0).toPath()); - 
logger.info(BUILDING_DONE_LOG_MESSAGE, REGULATION_NAME); + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(REGULATION_DATA)); } protected void parseGffFile(Path regulatoryFeatureFile) throws IOException, NoSuchMethodException, FileFormatException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java index 6cefc0266f..5ffabf747b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java @@ -20,8 +20,8 @@ import org.opencb.biodata.models.variant.avro.Repeat; import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.lib.EtlCommons; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; +import org.opencb.cellbase.lib.EtlCommons; import org.opencb.commons.ProgressLogger; import org.opencb.commons.utils.FileUtils; @@ -51,53 +51,56 @@ public RepeatsBuilder(Path filesDir, CellBaseFileSerializer serializer, CellBase @Override public void parse() throws Exception { - logger.info(BUILDING_LOG_MESSAGE, EtlCommons.REPEATS_NAME); + logger.info(BUILDING_LOG_MESSAGE, getDataName(REPEATS_DATA)); + + // Sanity check + checkDirectory(filesDir, getDataName(REPEATS_DATA)); // Check Simple Repeats (TRF) filename String trfFilename = Paths.get(configuration.getDownload().getSimpleRepeats().getFiles().get(SIMPLE_REPEATS_FILE_ID)).getFileName() .toString(); if (!Files.exists(filesDir.resolve(trfFilename))) { - throw new CellBaseException(TRF_NAME + " file " + trfFilename + " does not exist at " + filesDir); + throw new CellBaseException(getMessageMissingFile(TRF_DATA, trfFilename, filesDir)); } // Check Genomic Super Duplications (GSD) file String gsdFilename = 
Paths.get(configuration.getDownload().getGenomicSuperDups().getFiles().get(GENOMIC_SUPER_DUPS_FILE_ID)) .getFileName().toString(); if (!Files.exists(filesDir.resolve(gsdFilename))) { - throw new CellBaseException(GSD_NAME + " file " + gsdFilename + " does not exist at " + filesDir); + throw new CellBaseException(getMessageMissingFile(GSD_DATA, gsdFilename, filesDir)); } // Check Window Masker (WM) file String wmFilename = Paths.get(configuration.getDownload().getWindowMasker().getFiles().get(WINDOW_MASKER_FILE_ID)).getFileName() .toString(); if (!Files.exists(filesDir.resolve(wmFilename))) { - throw new CellBaseException(WM_NAME + " file " + wmFilename + " does not exist at " + filesDir); + throw new CellBaseException(getMessageMissingFile(WM_DATA, wmFilename, filesDir)); } // Parse TRF file - logger.info(BUILDING_LOG_MESSAGE, TRF_NAME); + logger.info(BUILDING_LOG_MESSAGE, getDataName(TRF_DATA)); parseTrfFile(filesDir.resolve(trfFilename)); - logger.info(BUILDING_DONE_LOG_MESSAGE, TRF_NAME); + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(TRF_DATA)); // Parse GSD file - logger.info(BUILDING_LOG_MESSAGE, GSD_NAME); + logger.info(BUILDING_LOG_MESSAGE, getDataName(GSD_DATA)); parseGsdFile(filesDir.resolve(gsdFilename)); - logger.info(BUILDING_DONE_LOG_MESSAGE, GSD_NAME); + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(GSD_DATA)); // Parse WM file - logger.info(BUILDING_LOG_MESSAGE, WM_NAME); + logger.info(BUILDING_LOG_MESSAGE, getDataName(WM_DATA)); parseWmFile(filesDir.resolve(wmFilename)); - logger.info(BUILDING_DONE_LOG_MESSAGE, WM_NAME); + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(WM_DATA)); - logger.info(BUILDING_DONE_LOG_MESSAGE, EtlCommons.REPEATS_NAME); + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(REPEATS_DATA)); } - private void parseTrfFile(Path filePath) throws IOException { + private void parseTrfFile(Path filePath) throws IOException, CellBaseException { try (BufferedReader bufferedReader = FileUtils.newBufferedReader(filePath)) 
{ String line = bufferedReader.readLine(); - ProgressLogger progressLogger = new ProgressLogger("Parsed " + TRF_NAME + " lines:", - () -> EtlCommons.countFileLines(filePath), 200).setBatchSize(10000); + ProgressLogger progressLogger = new ProgressLogger(getMessageParsedLines(TRF_DATA), () -> EtlCommons.countFileLines(filePath), + 200).setBatchSize(10000); while (line != null) { serializer.serialize(parseTrfLine(line)); line = bufferedReader.readLine(); @@ -111,15 +114,15 @@ private Repeat parseTrfLine(String line) { return new Repeat(null, Region.normalizeChromosome(parts[1]), Integer.valueOf(parts[2]) + 1, Integer.valueOf(parts[3]), Integer.valueOf(parts[5]), Integer.valueOf(parts[7]), - Float.valueOf(parts[6]), Float.valueOf(parts[8]) / 100, Float.valueOf(parts[10]), parts[16], TRF_NAME); + Float.valueOf(parts[6]), Float.valueOf(parts[8]) / 100, Float.valueOf(parts[10]), parts[16], TRF_DATA); } - private void parseGsdFile(Path filePath) throws IOException { + private void parseGsdFile(Path filePath) throws IOException, CellBaseException { try (BufferedReader bufferedReader = FileUtils.newBufferedReader(filePath)) { String line = bufferedReader.readLine(); - ProgressLogger progressLogger = new ProgressLogger("Parsed " + GSD_NAME + " lines:", - () -> EtlCommons.countFileLines(filePath), 200).setBatchSize(10000); + ProgressLogger progressLogger = new ProgressLogger(getMessageParsedLines(GSD_DATA), () -> EtlCommons.countFileLines(filePath), + 200).setBatchSize(10000); while (line != null) { serializer.serialize(parseGSDLine(line)); line = bufferedReader.readLine(); @@ -133,16 +136,16 @@ private Repeat parseGSDLine(String line) { return new Repeat(parts[11], Region.normalizeChromosome(parts[1]), Integer.valueOf(parts[2]) + 1, Integer.valueOf(parts[3]), null, null, 2f, Float.valueOf(parts[26]), null, - null, GSD_NAME); + null, GSD_DATA); } - private void parseWmFile(Path filePath) throws IOException { + private void parseWmFile(Path filePath) throws IOException, 
CellBaseException { try (BufferedReader bufferedReader = FileUtils.newBufferedReader(filePath)) { String line = bufferedReader.readLine(); - ProgressLogger progressLogger = new ProgressLogger("Parsed " + WM_NAME + " lines:", - () -> EtlCommons.countFileLines(filePath), 200).setBatchSize(10000); + ProgressLogger progressLogger = new ProgressLogger(getMessageParsedLines(WM_DATA), () -> EtlCommons.countFileLines(filePath), + 200).setBatchSize(10000); while (line != null) { serializer.serialize(parseWmLine(line)); line = bufferedReader.readLine(); @@ -155,6 +158,16 @@ private Repeat parseWmLine(String line) { String[] parts = line.split("\t"); return new Repeat(parts[4].replace("\t", ""), Region.normalizeChromosome(parts[1]), - Integer.valueOf(parts[2]) + 1, Integer.valueOf(parts[3]), null, null, null, null, null, null, WM_NAME); + Integer.valueOf(parts[2]) + 1, Integer.valueOf(parts[3]), null, null, null, null, null, null, WM_DATA); + } + + private String getMessageMissingFile(String data, String filename, Path folder) throws CellBaseException { + return getDataName(data) + " file " + filename + " does not exist at " + folder; } + + private String getMessageParsedLines(String data) throws CellBaseException { + return "Parsed " + getDataName(data) + " lines:"; + } + } + From 754384abae10c71cb156a4c0e2abd4688199ebf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 29 Apr 2024 13:03:02 +0200 Subject: [PATCH 064/148] lib: fix revel builder, #TASK-5776, #TASK-5564 --- .../org/opencb/cellbase/lib/builders/RevelScoreBuilder.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java index 68c6128f25..06f38f28f0 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java @@ -58,11 +58,11 @@ public void parse() throws IOException, CellBaseException { throw new CellBaseException("One " + dataName + " file is expected, but currently there are " + revelFiles.size() + " files"); } - ZipInputStream zis = new ZipInputStream(new FileInputStream(String.valueOf(revelDownloadPath))); - ZipEntry zipEntry = zis.getNextEntry(); - logger.info(PARSING_LOG_MESSAGE, revelFiles.get(0)); + ZipInputStream zis = new ZipInputStream(new FileInputStream(String.valueOf(revelFiles.get(0)))); + ZipEntry zipEntry = zis.getNextEntry(); + ZipFile zipFile = new ZipFile(revelFiles.get(0).toString()); InputStream inputStream = zipFile.getInputStream(zipEntry); try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream))) { From 24eb0911b83638028d61675b98bedbc7a8eccf1a Mon Sep 17 00:00:00 2001 From: imedina Date: Tue, 7 May 2024 01:48:53 +0100 Subject: [PATCH 065/148] configuration: update versions --- .../app/scripts/ensembl-scripts/DB_CONFIG.pm | 8 +++--- .../src/main/resources/configuration.yml | 27 ++++++++++--------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm b/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm index 70865465e9..b0edf65793 100755 --- a/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm +++ b/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm @@ -134,10 +134,10 @@ our $ENSEMBL_GENOMES_PORT = "4157"; our $ENSEMBL_GENOMES_USER = "anonymous"; ## Vertebrates -our $HOMO_SAPIENS_CORE = "homo_sapiens_core_110_38"; -our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_110_38"; -our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_110_38"; -our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_110_38"; +our $HOMO_SAPIENS_CORE = "homo_sapiens_core_111_38"; +our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_111_38"; +our $HOMO_SAPIENS_FUNCTIONAL = 
"homo_sapiens_funcgen_111_38"; +our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_111_38"; #our $HOMO_SAPIENS_CORE = "homo_sapiens_core_78_38"; #our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_78_38"; #our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_78_38"; diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index af817b1844..32a94fb765 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -73,7 +73,7 @@ download: host: ftp://ftp.ensemblgenomes.org/pub refSeq: host: https://ftp.ncbi.nih.gov/refseq/ - version: "October 16, 2023 (GRCh38.p14)" + version: "2023-10-11" files: GENOMIC_GTF: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz GENOMIC_FNA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz @@ -81,9 +81,9 @@ download: RNA_FNA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz maneSelect: host: https://ftp.ncbi.nlm.nih.gov/refseq/ - version: "1.1" + version: "1.2" files: - MANE_SELECT: MANE/MANE_human/release_1.1/MANE.GRCh38.v1.1.summary.txt.gz + MANE_SELECT: MANE/MANE_human/release_1.2/MANE.GRCh38.v1.2.summary.txt.gz lrg: host: http://ftp.ebi.ac.uk/ version: "2021-03-30" @@ -91,9 +91,9 @@ download: LRG: pub/databases/lrgex/list_LRGs_transcripts_xrefs.txt hgnc: host: https://ftp.ebi.ac.uk/ - version: "2023-11-01" + version: "2024-04-01" files: - HGNC: pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2023-11-01.txt + HGNC: pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2024-04-01.txt cancerHotspot: host: https://www.cancerhotspots.org/ version: "v2" @@ -106,7 +106,7 @@ download: DGIDB: data/monthly_tsvs/2022-Feb/interactions.tsv geneUniprotXref: host: http://ftp.uniprot.org/ - version: "2024_01 (24-Jan-2024)" + version: "2024-03-27" files: UNIPROT_XREF: 
pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz geneExpressionAtlas: @@ -116,6 +116,7 @@ download: GENE_EXPRESSION_ATLAS: pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz hpo: ## NOTE: Download manually from here now + version: "2024-04-26" host: https://hpo.jax.org/app/data/annotations disgenet: host: https://www.disgenet.org/ @@ -149,12 +150,12 @@ download: ## Protein Data uniprot: host: https://ftp.uniprot.org/ - version: "2024-01-24" + version: "2024-03-27" files: UNIPROT: pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz interpro: host: https://ftp.ebi.ac.uk/ - version: "2024-01-24" + version: "2024-03-27" files: INTERPRO: pub/databases/interpro/current_release/protein2ipr.dat.gz intact: @@ -180,7 +181,7 @@ download: host: http://ftp.ensembl.org/ version: "2023-05-17" files: - GERP: pub/release-110/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw + GERP: pub/release-111/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw ## Clinical Variant clinvar: @@ -204,11 +205,11 @@ download: files: HGMD: hgmd.vcf gwasCatalog: - ## Download file from https://www.ebi.ac.uk/gwas/docs/file-downloads to find the real version, which is 'e110_r2023-12-20' + ## Download file from https://www.ebi.ac.uk/gwas/docs/file-downloads to find the real version, which is 'e111_r2024-04-22' host: https://ftp.ebi.ac.uk/ - version: "2024-02-12" + version: "2024-04-22" files: - GWAS: pub/databases/gwas/releases/2024/02/12/gwas-catalog-associations_ontology-annotated.tsv + GWAS: pub/databases/gwas/releases/2024/04/22/gwas-catalog-associations_ontology-annotated.tsv DBSNP: All.vcf.gz dgv: @@ -290,7 +291,7 @@ species: - id: hsapiens scientificName: Homo sapiens assemblies: - - ensemblVersion: '110_38' + - ensemblVersion: '111_38' name: GRCh38 - 
ensemblVersion: '82_37' name: GRCh37 From fc09da4b34d0865d418db6c688074e586a16dc33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 7 May 2024 11:17:17 +0200 Subject: [PATCH 066/148] app: add bash script to fix the downloaded MirTarBase file, #TASK-5775, #TASK-5564 --- .../app/scripts/mirtarbase/fix-gene-symbol.sh | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100755 cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh diff --git a/cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh b/cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh new file mode 100755 index 0000000000..38c7d1efa2 --- /dev/null +++ b/cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +# The original MirTarBase hsa_MTI.xlsx contains invalid Gene Symbols in 793 lines. +# To fix it, that file has to be converted to a CSV file, i.e.: hsa_MTI.csv +# +# After converting to CSV file, we can see the errors from the original file for the Gene Symbols (column 4), +# e.g.: 06-mar: +# MIRT050267,hsa-miR-25-3p,Homo sapiens,06-mar,10299,Homo sapiens,CLASH,Functional MTI (Weak),23622248 +# MIRT051174,hsa-miR-16-5p,Homo sapiens,06-mar,10299,Homo sapiens,CLASH,Functional MTI (Weak),23622248 +# +# This script fixes those lines and converts column 4 to a valid Gene Symbol: +# +# MIRT050267,hsa-miR-25-3p,Homo sapiens,MARCHF6,10299,Homo sapiens,CLASH,Functional MTI (Weak),23622248 +# MIRT051174,hsa-miR-16-5p,Homo sapiens,MARCHF6,10299,Homo sapiens,CLASH,Functional MTI (Weak),23622248 + +# Check the parameters number +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +# Check CSV file +csv_file="$1" +if [ ! -f "$csv_file" ]; then + echo "CSV file '$csv_file' does not exist."
+ exit 1 +fi + +# Fix gene-symbol +while IFS=$'\t' read -r c1 c2 c3 c4 c5 c6 c7 c8 c9 || [[ -n "$c1" ]]; do + # Apply the conditions + if [ "$c5" = "10299" ]; then + c4="MARCHF6" + elif [ "$c5" = "51257" ]; then + c4="MARCHF2" + elif [ "$c5" = "54708" ]; then + c4="MARCHF5" + elif [ "$c5" = "54996" ]; then + c4="MTARC2" + elif [ "$c5" = "55016" ]; then + c4="MARCHF1" + elif [ "$c5" = "57574" ]; then + c4="MARCHF4" + elif [ "$c5" = "64757" ]; then + c4="MTARC1" + elif [ "$c5" = "64844" ]; then + c4="MARCHF7" + elif [ "$c5" = "92979" ]; then + c4="MARCHF9" + elif [ "$c5" = "115123" ]; then + c4="MARCHF3" + elif [ "$c5" = "220972" ]; then + c4="MARCHF8" + elif [ "$c5" = "441061" ]; then + c4="MARCHF11" + fi + + # Print line + echo -e "$c1\t$c2\t$c3\t$c4\t$c5\t$c6\t$c7\t$c8\t$c9" +done < "$csv_file" From 09d33a0ef6eb858a64c7b5b6796ae098cb5d36f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 7 May 2024 11:19:29 +0200 Subject: [PATCH 067/148] core: add some comments to the configuration file, #TASK-5775, #TASK-5564 --- cellbase-core/src/main/resources/configuration.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index af817b1844..a7b6a78f48 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -144,6 +144,8 @@ download: host: https://mirtarbase.cuhk.edu.cn/ version: "9.0" files: + # This file contains errors and has to be fixed before building + # check the script cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/hsa_MTI.xlsx ## Protein Data From 303585debf0fc5225d0948b25f271ca6185e6c39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 7 May 2024 11:30:01 +0200 Subject: [PATCH 068/148] lib: update Ensembl/RefSeq indexers and
builders (include major improvements and sonnar fixes), #TASK-5776, #TASK-5564 --- .../app/cli/admin/AdminCliOptionsParser.java | 17 +- .../cellbase/app/cli/admin/AdminMain.java | 1 + .../admin/executors/BuildCommandExecutor.java | 26 +- .../org/opencb/cellbase/lib/EtlCommons.java | 7 + .../lib/builders/CellBaseBuilder.java | 41 +- .../lib/builders/EnsemblGeneBuilder.java | 956 ++++++++++++++++++ .../builders/EnsemblGeneBuilderIndexer.java | 375 ++----- .../cellbase/lib/builders/GeneBuilder.java | 910 +---------------- .../lib/builders/GeneBuilderIndexer.java | 617 +++++++---- .../lib/builders/RefSeqGeneBuilder.java | 169 ++-- .../builders/RefSeqGeneBuilderIndexer.java | 261 +---- .../cellbase/lib/builders/RocksDbManager.java | 5 +- .../lib/builders/EnsemblGeneBuilderTest.java | 22 + .../lib/builders/GeneBuilderTest.java | 94 +- 14 files changed, 1706 insertions(+), 1795 deletions(-) create mode 100644 cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java create mode 100644 cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderTest.java diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java index 55342641b3..1bda7d2793 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java @@ -24,6 +24,8 @@ import java.util.List; import java.util.Map; +import static org.opencb.cellbase.lib.EtlCommons.*; + /** * Created by imedina on 03/02/15. 
*/ @@ -87,10 +89,10 @@ public class DownloadCommandOptions { @ParametersDelegate public SpeciesAndAssemblyCommandOptions speciesAndAssemblyOptions = speciesAndAssemblyCommandOptions; - @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to download: genome, gene, " - + "variation_functional_score, missense_variation_functional_score, regulation, protein, conservation, " - + "clinical_variants, repeats, ontology, pubmed and pharmacogenomics; or use 'all' to download everything", - required = true, arity = 1) + @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to download: " + GENOME_DATA + "," + GENE_DATA + + "," + VARIATION_FUNCTIONAL_SCORE_DATA + "," + MISSENSE_VARIATION_SCORE_DATA + "," + REGULATION_DATA + "," + PROTEIN_DATA + + "," + CONSERVATION_DATA + "," + CLINICAL_VARIANTS_DATA + "," + REPEATS_DATA + "," + ONTOLOGY_DATA + "," + PUBMED_DATA + + "," + PHARMACOGENOMICS_DATA + "; or use 'all' to download everything", required = true, arity = 1) public String data; @Parameter(names = {"-o", "--outdir"}, description = "Downloaded files will be saved in this directory.", required = true, @@ -104,9 +106,10 @@ public class BuildCommandOptions { @ParametersDelegate public CommonCommandOptions commonOptions = commonCommandOptions; - @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to build: genome, genome_info, " - + "gene, variation, variation_functional_score, regulation, protein, ppi, conservation, drug, " - + "clinical_variants, repeats, svs, splice_score, pubmed. 
'all' builds everything.", required = true, arity = 1) + @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to build: " + GENOME_DATA + "," + GENE_DATA + "," + + VARIATION_FUNCTIONAL_SCORE_DATA + "," + MISSENSE_VARIATION_SCORE_DATA + "," + REGULATION_DATA + "," + PROTEIN_DATA + "," + + CONSERVATION_DATA + "," + CLINICAL_VARIANTS_DATA + "," + REPEATS_DATA + "," + ONTOLOGY_DATA + "," + SPLICE_SCORE_DATA + + "," + PUBMED_DATA + "," + PHARMACOGENOMICS_DATA + "; or use 'all' to build everything", required = true, arity = 1) public String data; @Parameter(names = {"-s", "--species"}, description = "Name of the species to be built, valid formats include 'Homo sapiens' or 'hsapiens'", required = false, arity = 1) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java index d77722a492..fecf57c08a 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java @@ -98,6 +98,7 @@ public static void main(String[] args) { commandExecutor.execute(); } catch (IOException | URISyntaxException | CellBaseException e) { commandExecutor.getLogger().error("Error: " + e.getMessage()); + e.printStackTrace(); System.exit(1); } } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 0cf6b17899..081880ebe3 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -64,9 +64,9 @@ public class BuildCommandExecutor extends CommandExecutor { private boolean flexibleGTFParsing; private SpeciesConfiguration speciesConfiguration; - private 
static final List VALID_SOURCES_TO_BUILD = Arrays.asList(GENOME_DATA, GENE_DATA, REFSEQ_DATA, - VARIATION_FUNCTIONAL_SCORE_DATA, MISSENSE_VARIATION_SCORE_DATA, REGULATION_DATA, PROTEIN_DATA, CONSERVATION_DATA, - CLINICAL_VARIANTS_DATA, REPEATS_DATA, ONTOLOGY_DATA, SPLICE_SCORE_DATA, PUBMED_DATA, PHARMACOGENOMICS_DATA); + private static final List VALID_SOURCES_TO_BUILD = Arrays.asList(GENOME_DATA, GENE_DATA, VARIATION_FUNCTIONAL_SCORE_DATA, + MISSENSE_VARIATION_SCORE_DATA, REGULATION_DATA, PROTEIN_DATA, CONSERVATION_DATA, CLINICAL_VARIANTS_DATA, REPEATS_DATA, + ONTOLOGY_DATA, SPLICE_SCORE_DATA, PUBMED_DATA, PHARMACOGENOMICS_DATA); public BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildCommandOptions) { super(buildCommandOptions.commonOptions.logLevel, buildCommandOptions.commonOptions.conf); @@ -135,9 +135,6 @@ public void execute() throws CellBaseException { case GENE_DATA: parser = buildGene(); break; - case REFSEQ_DATA: - parser = buildRefSeq(); - break; case VARIATION_FUNCTIONAL_SCORE_DATA: parser = buildCadd(); break; @@ -246,22 +243,7 @@ private CellBaseBuilder buildGenomeSequence() throws CellBaseException { } private CellBaseBuilder buildGene() throws CellBaseException { - Path geneFolderPath = downloadFolder.resolve("gene"); - copyVersionFiles(Arrays.asList(geneFolderPath.resolve("dgidbVersion.json"), - geneFolderPath.resolve("ensemblCoreVersion.json"), geneFolderPath.resolve("uniprotXrefVersion.json"), - geneFolderPath.resolve("geneExpressionAtlasVersion.json"), - geneFolderPath.resolve("hpoVersion.json"), geneFolderPath.resolve("disgenetVersion.json"), - geneFolderPath.resolve("gnomadVersion.json"))); - Path genomeFastaFilePath = getFastaReferenceGenome(); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "gene"); - return new GeneBuilder(geneFolderPath, genomeFastaFilePath, speciesConfiguration, flexibleGTFParsing, serializer); - } - - private CellBaseBuilder buildRefSeq() { - Path refseqFolderPath = 
downloadFolder.resolve("refseq"); - copyVersionFiles(Arrays.asList(refseqFolderPath.resolve("refSeqVersion.json"))); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "refseq"); - return new RefSeqGeneBuilder(refseqFolderPath, speciesConfiguration, serializer); + return new GeneBuilder(downloadFolder.resolve(GENE_DATA), buildFolder.resolve(GENE_DATA), speciesConfiguration, flexibleGTFParsing); } private CellBaseBuilder buildCadd() throws CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index a836288d6f..e0a19c7114 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -44,6 +44,10 @@ */ public final class EtlCommons { + // Commons + public static final String XLSX_EXTENSION = ".xlsx"; + public static final String CSV_EXTENSION = ".csv"; + // Ensembl public static final String ENSEMBL_DATA = "ensembl"; public static final String PUT_RELEASE_HERE_MARK = "put_release_here"; @@ -61,6 +65,7 @@ public final class EtlCommons { public static final String ENSEMBL_MOTIF_FEATURES_INDEX_FILE_ID = "MOTIF_FEATURES_INDEX"; public static final String HOMO_SAPIENS_NAME= "Homo sapiens"; + public static final String HSAPIENS_NAME= "hsapiens"; public static final String GRCH38_NAME = "GRCh38"; public static final String GRCH37_NAME = "GRCh37"; @@ -74,11 +79,13 @@ public final class EtlCommons { // Gene public static final String GENE_DATA = "gene"; + public static final String ENSEMBL_GENE_BASENAME = "ensemblGene"; public static final String GENE_ANNOTATION_DATA = "gene_annotation"; public static final String GENE_DISEASE_ANNOTATION_DATA = "gene_disease_annotation"; // RefSeq public static final String REFSEQ_DATA = "refseq"; + public static final String REFSEQ_GENE_BASENAME = "refSeqGene"; // Must match the configuration file public static 
final String REFSEQ_GENOMIC_GTF_FILE_ID = "GENOMIC_GTF"; public static final String REFSEQ_GENOMIC_FNA_FILE_ID = "GENOMIC_FNA"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java index f5e79320d7..26fb2e838b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java @@ -18,6 +18,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectReader; +import org.apache.commons.lang3.StringUtils; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseSerializer; @@ -25,12 +26,15 @@ import org.slf4j.LoggerFactory; import java.io.File; +import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; import java.util.stream.Collectors; +import static org.opencb.cellbase.lib.EtlCommons.*; + /** * Created by imedina on 30/08/14. 
*/ @@ -39,33 +43,54 @@ public abstract class CellBaseBuilder { protected CellBaseSerializer serializer; protected ObjectReader dataSourceReader = new ObjectMapper().readerFor(DataSource.class); + protected boolean checked; + protected Logger logger; + public static final String CHECKING_BEFORE_BUILDING_LOG_MESSAGE = "Checking files before building {} ..."; + public static final String CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE = "Checking done!"; + public static final String BUILDING_LOG_MESSAGE = "Building {} ..."; - public static final String BUILDING_DONE_LOG_MESSAGE = "Building {} done."; + public static final String BUILDING_DONE_LOG_MESSAGE = "Building done!"; public static final String CATEGORY_BUILDING_LOG_MESSAGE = "Building {}/{} ..."; - public static final String CATEGORY_BUILDING_DONE_LOG_MESSAGE = "Building {}/{} done."; + public static final String CATEGORY_BUILDING_DONE_LOG_MESSAGE = "Building done!"; public static final String PARSING_LOG_MESSAGE = "Parsing {} ..."; - public static final String PARSING_DONE_LOG_MESSAGE = "Parsing {} done."; + public static final String PARSING_DONE_LOG_MESSAGE = "Parsing done!"; public CellBaseBuilder(CellBaseSerializer serializer) { logger = LoggerFactory.getLogger(this.getClass()); this.serializer = serializer; - //this.serializer.open(); + this.checked = false; } public abstract void parse() throws Exception; public void disconnect() { - try { - serializer.close(); - } catch (Exception e) { - logger.error("Disconnecting serializer: " + e.getMessage()); + if (serializer != null) { + try { + serializer.close(); + } catch (Exception e) { + logger.error("Error closing serializer:\n" + StringUtils.join(e.getStackTrace(), "\n")); + } + } + } + + protected List checkFiles(String data, Path downloadPath, int expectedFiles) throws CellBaseException, IOException { + return checkFiles(getDataName(data), data, downloadPath, expectedFiles); + } + + protected List checkFiles(String label, String data, Path downloadPath, int 
expectedFiles) throws CellBaseException, IOException { + List files = checkFiles(dataSourceReader.readValue(downloadPath.resolve(getDataVersionFilename(data)).toFile()), + downloadPath, label); + if (files.size() != expectedFiles) { + throw new CellBaseException(expectedFiles + " " + label + " files are expected at " + downloadPath + ", but currently there" + + " are " + files.size() + " files"); } + return files; } protected List checkFiles(DataSource dataSource, Path targetPath, String name) throws CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java new file mode 100644 index 0000000000..a7e6b9f1cf --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java @@ -0,0 +1,956 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.opencb.cellbase.lib.builders; + +import htsjdk.tribble.readers.TabixReader; +import org.apache.commons.lang3.StringUtils; +import org.opencb.biodata.formats.feature.gff.Gff2; +import org.opencb.biodata.formats.feature.gtf.Gtf; +import org.opencb.biodata.formats.feature.gtf.io.GtfReader; +import org.opencb.biodata.formats.io.FileFormatException; +import org.opencb.biodata.models.core.*; +import org.opencb.biodata.tools.sequence.FastaIndex; +import org.opencb.cellbase.core.ParamConstants; +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.config.SpeciesConfiguration; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; +import org.opencb.cellbase.core.serializer.CellBaseSerializer; +import org.rocksdb.RocksDBException; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.*; +import java.util.stream.Collectors; + +import static org.opencb.cellbase.lib.EtlCommons.*; + +public class EnsemblGeneBuilder extends CellBaseBuilder { + + private Path downloadPath; + private SpeciesConfiguration speciesConfiguration; + private boolean flexibleGTFParsing; + private CellBaseConfiguration configuration; + + private Map transcriptDict; + private Map exonDict; + + private Path gtfFile; + private Path proteinFastaFile; + private Path cDnaFastaFile; + private Path geneDescriptionFile; + private Path xrefsFile; + private Path hgncFile; + private Path maneFile; + private Path lrgFile; + private Path uniprotIdMappingFile; + private Path tfbsFile; + private Path tabixFile; + private Path geneExpressionFile; + private Path geneDrugFile; + private Path hpoFile; + private Path disgenetFile; + private Path genomeSequenceFilePath; + private Path gnomadFile; + private Path geneOntologyAnnotationFile; + private Path miRBaseFile; + private Path miRTarBaseFile; + private 
Path cancerGeneCensusFile; + private Path cancerHostpotFile; + private Path ensemblCanonicalFile; + private Path tso500File; + private Path eglhHaemOncFile; + + // source for genes is either ensembl or refseq + private final String SOURCE = ParamConstants.QueryParams.ENSEMBL.key(); + + private int geneCounter; + private ArrayList geneList; + private String geneName; + private int transcriptCounter; + private ArrayList transcriptList; + private String transcriptName; + private int exonCounter; + private String feature; + private Gtf nextGtfToReturn; + + public EnsemblGeneBuilder(Path downloadPath, SpeciesConfiguration speciesConfiguration, boolean flexibleGTFParsing, + CellBaseSerializer serializer) { + super(serializer); + + this.downloadPath = downloadPath; + this.speciesConfiguration = speciesConfiguration; + this.flexibleGTFParsing = flexibleGTFParsing; + + transcriptDict = new HashMap<>(250000); + exonDict = new HashMap<>(8000000); + } + + public void check() throws Exception { + if (checked) { + return; + } + + String ensemblGeneLabel = getDataName(ENSEMBL_DATA) + " " + getDataName(GENE_DATA); + logger.info(CHECKING_BEFORE_BUILDING_LOG_MESSAGE, ensemblGeneLabel); + + // Sanity check + checkDirectory(downloadPath, ensemblGeneLabel); + if (!Files.exists(serializer.getOutdir())) { + try { + Files.createDirectories(serializer.getOutdir()); + } catch (IOException e) { + throw new CellBaseException("Error creating folder " + serializer.getOutdir(), e); + } + } + + // Check Ensembl files + List files = checkFiles(ensemblGeneLabel, ENSEMBL_DATA, downloadPath, 3); + gtfFile = files.stream().filter(f -> f.getName().contains(".gtf")).findFirst().get().toPath(); + proteinFastaFile = files.stream().filter(f -> f.getName().contains(".pep.all.fa")).findFirst().get().toPath(); + cDnaFastaFile = files.stream().filter(f -> f.getName().contains(".cdna.all.fa")).findFirst().get().toPath(); + + // Check common files + // geneDescriptionFile = + // xrefsFile = + maneFile = 
checkFiles(MANE_SELECT_DATA, downloadPath.getParent(), 1).get(0).toPath(); + lrgFile = checkFiles(LRG_DATA, downloadPath.getParent(), 1).get(0).toPath(); + hgncFile = checkFiles(HGNC_DATA, downloadPath.getParent(), 1).get(0).toPath(); + cancerHostpotFile = checkFiles(CANCER_HOTSPOT_DATA, downloadPath.getParent(), 1).get(0).toPath(); + geneDrugFile = checkFiles(DGIDB_DATA, downloadPath.getParent(), 1).get(0).toPath(); + uniprotIdMappingFile = checkFiles(UNIPROT_XREF_DATA, downloadPath.getParent(), 1).get(0).toPath(); + geneExpressionFile = checkFiles(GENE_EXPRESSION_ATLAS_DATA, downloadPath.getParent(), 1).get(0).toPath(); + // hpoFile = checkFiles(HPO_DATA, downloadPath.getParent(), 1); + disgenetFile = checkFiles(DISGENET_DATA, downloadPath.getParent(), 1).get(0).toPath(); + gnomadFile = checkFiles(GNOMAD_CONSTRAINTS_DATA, downloadPath.getParent(), 1).get(0).toPath(); + geneOntologyAnnotationFile = checkFiles(GO_ANNOTATION_DATA, downloadPath.getParent(), 1).get(0).toPath(); + // ensemblCanonicalFile = ; + // cancerGeneCensus = + // tso500File = + // eglhHaemOncFile = + + // Check regulation files + // Motif features + files = checkFiles(ensemblGeneLabel, MOTIF_FEATURES_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA), 2); + if (files.get(0).getName().endsWith("tbi")) { + tabixFile = files.get(0).toPath(); + tfbsFile = files.get(1).toPath(); + } else { + tabixFile = files.get(1).toPath(); + tfbsFile = files.get(0).toPath(); + } + // mirbase + miRBaseFile = checkFiles(MIRBASE_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA), 1).get(0).toPath(); + + // mirtarbase + // The downloaded .xlsx file contains errors and it has to be fixed manually + logger.info("Checking {} folder and files", getDataName(MIRTARBASE_DATA)); + Path downloadRegulationPath = downloadPath.getParent().getParent().resolve(REGULATION_DATA); + List mirTarBaseFiles = ((DataSource) dataSourceReader.readValue(downloadRegulationPath.resolve( + 
getDataVersionFilename(MIRTARBASE_DATA)).toFile())).getUrls().stream().map(u -> Paths.get(u).getFileName().toString()) + .collect(Collectors.toList()); + if (mirTarBaseFiles.size() != 1) { + throw new CellBaseException("One " + getDataName(MIRTARBASE_DATA) + " file is expected at " + downloadRegulationPath + + ", but currently there are " + mirTarBaseFiles.size() + " files"); + } + // The hsa_MIT.xlsx is fixed and converted to hsa_MIT.csv manually + if (!mirTarBaseFiles.get(0).endsWith(XLSX_EXTENSION)) { + throw new CellBaseException("A " + XLSX_EXTENSION + " " + getDataName(MIRTARBASE_DATA) + " file is expected at " + + downloadRegulationPath + ", but currently it is named " + mirTarBaseFiles.get(0)); + } + miRTarBaseFile = downloadRegulationPath.resolve(mirTarBaseFiles.get(0).replace(XLSX_EXTENSION, CSV_EXTENSION)); + if (!Files.exists(miRTarBaseFile)) { + throw new CellBaseException("The " + getDataName(MIRTARBASE_DATA) + " fixed file " + miRTarBaseFile + " does not exist"); + } + + // Check genome fasta file + genomeSequenceFilePath = checkFiles(GENOME_DATA, downloadPath.getParent().getParent().resolve(GENOME_DATA), 1).get(0).toPath(); + + logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE); + checked = true; + } + + public void parse() throws Exception { + check(); + + Gene gene = null; + Transcript transcript; + Exon exon = null; + int cdna = 1; + int cds = 1; + + EnsemblGeneBuilderIndexer indexer = new EnsemblGeneBuilderIndexer(serializer.getOutdir()); + + try { + // process files and put values in rocksdb + indexer.index(geneDescriptionFile, xrefsFile, hgncFile, maneFile, lrgFile, uniprotIdMappingFile, + proteinFastaFile, cDnaFastaFile, speciesConfiguration.getScientificName(), geneExpressionFile, + geneDrugFile, hpoFile, disgenetFile, gnomadFile, geneOntologyAnnotationFile, miRBaseFile, + miRTarBaseFile, cancerGeneCensusFile, cancerHostpotFile, ensemblCanonicalFile, + tso500File, eglhHaemOncFile); + + TabixReader tabixReader = null; + if 
(!Files.exists(tfbsFile) || !Files.exists(tabixFile)) { + logger.error("Tfbs or tabix file not found. Download them and try again."); + } else { + tabixReader = new TabixReader(tfbsFile.toAbsolutePath().toString(), tabixFile.toAbsolutePath().toString()); + } + + // Preparing the fasta file for fast accessing +// System.out.println("genomeSequenceFilePath.toString() = " + genomeSequenceFilePath.toString()); + FastaIndex fastaIndex = new FastaIndex(genomeSequenceFilePath); + + // Empty transcript and exon dictionaries + transcriptDict.clear(); + exonDict.clear(); + + logger.info(PARSING_LOG_MESSAGE, gtfFile); + GtfReader gtfReader = new GtfReader(gtfFile); + + // Gene->Transcript->Feature->GTF line + Map>> gtfMap = null; + if (flexibleGTFParsing) { + gtfMap = loadGTFMap(gtfReader); + initializePointers(gtfMap); + } + + Gtf gtf; + while ((gtf = getGTFEntry(gtfReader, gtfMap)) != null) { + + if (gtf.getFeature().equals("gene") || gtf.getFeature().equals("transcript") + || gtf.getFeature().equals("UTR") || gtf.getFeature().equals("Selenocysteine")) { + continue; + } + + String geneId = gtf.getAttributes().get("gene_id"); + String transcriptId = gtf.getAttributes().get("transcript_id"); + String geneName = gtf.getAttributes().get("gene_name"); + if (newGene(gene, geneId)) { + // If new geneId is different from the current then we must serialize before data new gene + if (gene != null) { + serializer.serialize(gene); + } + + GeneAnnotation geneAnnotation = new GeneAnnotation(indexer.getExpression(geneId), indexer.getDiseases(geneName), + indexer.getDrugs(geneName), indexer.getConstraints(geneId), indexer.getMirnaTargets(geneName), + indexer.getCancerGeneCensus(geneName), indexer.getCancerHotspot(geneName)); + + gene = new Gene(geneId, geneName, gtf.getSequenceName().replaceFirst("chr", ""), + gtf.getStart(), gtf.getEnd(), gtf.getStrand(), gtf.getAttributes().get("gene_version"), + gtf.getAttributes().get("gene_biotype"), "KNOWN", SOURCE, indexer.getDescription(geneId), + 
new ArrayList<>(), indexer.getMirnaGene(transcriptId), geneAnnotation); + } + + // Check if Transcript exist in the Gene Set of transcripts + if (!transcriptDict.containsKey(transcriptId)) { + transcript = getTranscript(gene, indexer, tabixReader, gtf, transcriptId); + } else { + transcript = gene.getTranscripts().get(transcriptDict.get(transcriptId)); + } + + // At this point gene and transcript objects are set up + // Update gene and transcript genomic coordinates, start must be the + // lower, and end the higher + updateTranscriptAndGeneCoords(transcript, gene, gtf); + + String transcriptIdWithoutVersion = transcript.getId().split("\\.")[0]; + if (gtf.getFeature().equalsIgnoreCase("exon")) { + // Obtaining the exon sequence + String exonId = gtf.getAttributes().get("exon_id") + "." + gtf.getAttributes().get("exon_version"); + String exonSequence = fastaIndex.query(gtf.getSequenceName(), gtf.getStart(), gtf.getEnd()); + + exon = new Exon(exonId, gtf.getSequenceName().replaceFirst("chr", ""), + gtf.getStart(), gtf.getEnd(), gtf.getStrand(), 0, 0, 0, 0, 0, 0, -1, Integer.parseInt(gtf + .getAttributes().get("exon_number")), exonSequence); + transcript.getExons().add(exon); + + exonDict.put(transcriptIdWithoutVersion + "_" + exon.getExonNumber(), exon); + if (gtf.getAttributes().get("exon_number").equals("1")) { + cdna = 1; + cds = 1; + } else { + // with every exon we update cDNA length with the previous exon length + cdna += exonDict.get(transcriptIdWithoutVersion + "_" + (exon.getExonNumber() - 1)).getEnd() + - exonDict.get(transcriptIdWithoutVersion + "_" + (exon.getExonNumber() - 1)).getStart() + 1; + } + } else { + exon = exonDict.get(transcriptIdWithoutVersion + "_" + exon.getExonNumber()); + if (gtf.getFeature().equalsIgnoreCase("CDS")) { + // Protein ID is only present in CDS lines + String proteinId = gtf.getAttributes().get("protein_id") != null + ? gtf.getAttributes().get("protein_id") + "." 
+ gtf.getAttributes().get("protein_version") + : ""; + transcript.setProteinId(proteinId); + transcript.setProteinSequence(indexer.getProteinFasta(proteinId)); + + if (gtf.getStrand().equals("+") || gtf.getStrand().equals("1")) { + // CDS states the beginning of coding start + exon.setGenomicCodingStart(gtf.getStart()); + exon.setGenomicCodingEnd(gtf.getEnd()); + + // cDNA coordinates + exon.setCdnaCodingStart(gtf.getStart() - exon.getStart() + cdna); + exon.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); + // Set cdnaCodingEnd to prevent those cases without stop_codon + + transcript.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); + exon.setCdsStart(cds); + exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); + + // increment in the coding length + cds += gtf.getEnd() - gtf.getStart() + 1; + transcript.setCdsLength(cds - 1); // Set cdnaCodingEnd to prevent those cases without stop_codon + + exon.setPhase(Integer.parseInt(gtf.getFrame())); + + if (transcript.getGenomicCodingStart() == 0 || transcript.getGenomicCodingStart() > gtf.getStart()) { + transcript.setGenomicCodingStart(gtf.getStart()); + } + if (transcript.getGenomicCodingEnd() == 0 || transcript.getGenomicCodingEnd() < gtf.getEnd()) { + transcript.setGenomicCodingEnd(gtf.getEnd()); + } + // only first time + if (transcript.getCdnaCodingStart() == 0) { + transcript.setCdnaCodingStart(gtf.getStart() - exon.getStart() + cdna); + } + // strand - + } else { + // CDS states the beginning of coding start + exon.setGenomicCodingStart(gtf.getStart()); + exon.setGenomicCodingEnd(gtf.getEnd()); + // cDNA coordinates + // cdnaCodingStart points to the same base position than genomicCodingEnd + exon.setCdnaCodingStart(exon.getEnd() - gtf.getEnd() + cdna); + // cdnaCodingEnd points to the same base position than genomicCodingStart + exon.setCdnaCodingEnd(exon.getEnd() - gtf.getStart() + cdna); + // Set cdnaCodingEnd to prevent those cases without stop_codon + transcript.setCdnaCodingEnd(exon.getEnd() - 
gtf.getStart() + cdna); + exon.setCdsStart(cds); + exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); + + // increment in the coding length + cds += gtf.getEnd() - gtf.getStart() + 1; + transcript.setCdsLength(cds - 1); // Set cdnaCodingEnd to prevent those cases without stop_codon + exon.setPhase(Integer.parseInt(gtf.getFrame())); + + if (transcript.getGenomicCodingStart() == 0 || transcript.getGenomicCodingStart() > gtf.getStart()) { + transcript.setGenomicCodingStart(gtf.getStart()); + } + if (transcript.getGenomicCodingEnd() == 0 || transcript.getGenomicCodingEnd() < gtf.getEnd()) { + transcript.setGenomicCodingEnd(gtf.getEnd()); + } + // only first time + if (transcript.getCdnaCodingStart() == 0) { + // cdnaCodingStart points to the same base position than genomicCodingEnd + transcript.setCdnaCodingStart(exon.getEnd() - gtf.getEnd() + cdna); + } + } + + } +// if (gtf.getFeature().equalsIgnoreCase("start_codon")) { +// // nothing to do +// System.out.println("Empty block, this should be redesigned"); +// } + if (gtf.getFeature().equalsIgnoreCase("stop_codon")) { + // setCdnaCodingEnd = false; // stop_codon found, cdnaCodingEnd will be set here, + // no need to set it at the beginning of next feature + if (exon.getStrand().equals("+")) { + updateStopCodingDataPositiveExon(exon, cdna, cds, gtf); + + cds += gtf.getEnd() - gtf.getStart(); + // If stop_codon appears, overwrite values + transcript.setGenomicCodingEnd(gtf.getEnd()); + transcript.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); + transcript.setCdsLength(cds - 1); + + } else { + updateNegativeExonCodingData(exon, cdna, cds, gtf); + + cds += gtf.getEnd() - gtf.getStart(); + // If stop_codon appears, overwrite values + transcript.setGenomicCodingStart(gtf.getStart()); + // cdnaCodingEnd points to the same base position than genomicCodingStart + transcript.setCdnaCodingEnd(exon.getEnd() - gtf.getStart() + cdna); + transcript.setCdsLength(cds - 1); + } + } + } + } + + // last gene must be 
serialized + serializer.serialize(gene); + + // Close + gtfReader.close(); + serializer.close(); + fastaIndex.close(); + indexer.close(); + + logger.info(PARSING_DONE_LOG_MESSAGE, gtfFile); + } catch (Exception e) { + indexer.close(); + throw e; + } + } + + private Transcript getTranscript(Gene gene, EnsemblGeneBuilderIndexer indexer, TabixReader tabixReader, Gtf gtf, String transcriptId) + throws IOException, RocksDBException { + Map gtfAttributes = gtf.getAttributes(); + + // To match Ensembl, we set the ID as transcript+version. This also matches the Ensembl website. + String transcriptIdWithVersion = transcriptId + "." + gtfAttributes.get("transcript_version"); + String biotype = gtfAttributes.get("transcript_biotype") != null ? gtfAttributes.get("transcript_biotype") : ""; + String transcriptChromosome = gtf.getSequenceName().replaceFirst("chr", ""); + List transcriptTfbses = getTranscriptTfbses(gtf, transcriptChromosome, tabixReader); + + List ontologyAnnotations = getOntologyAnnotations(indexer.getXrefs(transcriptId), indexer); + TranscriptAnnotation transcriptAnnotation = new TranscriptAnnotation(ontologyAnnotations, indexer.getConstraints(transcriptId)); + + Transcript transcript = new Transcript(transcriptIdWithVersion, gtfAttributes.get("transcript_name"), transcriptChromosome, + gtf.getStart(), gtf.getEnd(), gtf.getStrand(), biotype, "KNOWN", + 0, 0, 0, 0, 0, + indexer.getCdnaFasta(transcriptIdWithVersion), "", "", "", + gtfAttributes.get("transcript_version"), SOURCE, new ArrayList<>(), indexer.getXrefs(transcriptId), transcriptTfbses, + new HashSet<>(), transcriptAnnotation); + + // Adding Ids appearing in the GTF to the xrefs is required, since for some unknown reason the ENSEMBL + // Perl API often doesn't return all genes resulting in an incomplete xrefs.txt file. 
We must ensure + // that the xrefs array contains all ids present in the GTF file + addGtfXrefs(transcript, gene, gtfAttributes); + + // Add HGNC ID mappings, with this we can know which Ensembl and Refseq transcripts match to HGNC ID + String hgncId = indexer.getHgncId(gene.getName()); + if (StringUtils.isNotEmpty(hgncId)) { + transcript.getXrefs().add(new Xref(hgncId, "hgnc_id", "HGNC ID")); + } + + // Add MANE Select mappings, with this we can know which Ensembl and Refseq transcripts match according to MANE + for (String suffix: Arrays.asList("refseq", "refseq_protein")) { + String maneRefSeq = indexer.getMane(transcriptIdWithVersion, suffix); + if (StringUtils.isNotEmpty(maneRefSeq)) { + transcript.getXrefs().add(new Xref(maneRefSeq, "mane_select_" + suffix, + "MANE Select RefSeq" + (suffix.contains("_") ? " Protein" : ""))); + } + } + + // Add LRG mappings, with this we can know which Ensembl and Refseq transcripts match according to LRG + String lrgRefSeq = indexer.getLrg(transcriptIdWithVersion, "refseq"); + if (StringUtils.isNotEmpty(lrgRefSeq)) { + transcript.getXrefs().add(new Xref(lrgRefSeq, "lrg_refseq", "LRG RefSeq")); + } + + // Add Flags + // 1. GTF tags + String tags = gtf.getAttributes().get("tag"); + if (StringUtils.isNotEmpty(tags)) { + transcript.getFlags().addAll(Arrays.asList(tags.split(","))); + } + // 2. TSL + String supportLevel = gtfAttributes.get("transcript_support_level"); + if (StringUtils.isNotEmpty(supportLevel)) { + // split on space so "5 (assigned to previous version 3)" and "5" both become "TSL:5" + String truncatedSupportLevel = supportLevel.split(" ")[0]; + transcript.getFlags().add("TSL:" + truncatedSupportLevel); + } + // 3. MANE Flag + String maneFlag = indexer.getMane(transcriptIdWithVersion, "flag"); + if (StringUtils.isNotEmpty(maneFlag)) { + transcript.getFlags().add(maneFlag); + } + // 4. 
LRG Flag + String lrg = indexer.getLrg(transcriptIdWithVersion, "ensembl"); + if (StringUtils.isNotEmpty(lrg)) { + transcript.getFlags().add("LRG"); + } else { + for (Xref xref : transcript.getXrefs()) { + if (xref.getId().startsWith("LRG_") && xref.getId().contains("t")) { + transcript.getFlags().add("LRG"); + } + } + } + // 5. Ensembl Canonical + String canonicalFlag = indexer.getCanonical(transcriptIdWithVersion); + if (StringUtils.isNotEmpty(canonicalFlag)) { + transcript.getFlags().add(canonicalFlag); + } + + // 6. TSO500 and EGLH HaemOnc + String maneRefSeq = indexer.getMane(transcriptIdWithVersion, "refseq"); + if (StringUtils.isNotEmpty(maneRefSeq)) { + String tso500Flag = indexer.getTSO500(maneRefSeq.split("\\.")[0]); + if (StringUtils.isNotEmpty(tso500Flag)) { + transcript.getFlags().add(tso500Flag); + } + + String eglhHaemOncFlag = indexer.getEGLHHaemOnc(maneRefSeq.split("\\.")[0]); + if (StringUtils.isNotEmpty(eglhHaemOncFlag)) { + transcript.getFlags().add(eglhHaemOncFlag); + } + } + + gene.getTranscripts().add(transcript); + + // Do not change order!! size()-1 is the index of the transcript ID + transcriptDict.put(transcriptId, gene.getTranscripts().size() - 1); + return transcript; + } + + private List getOntologyAnnotations(List xrefs, EnsemblGeneBuilderIndexer indexer) + throws IOException, RocksDBException { + if (xrefs == null || indexer == null) { + return null; + } + List annotations = new ArrayList<>(); + for (Xref xref : xrefs) { + if (xref.getDbName().equals("uniprotkb_acc")) { + String key = xref.getId(); + if (key != null && indexer.getOntologyAnnotations(key) != null) { + annotations.addAll(indexer.getOntologyAnnotations(key)); + } + } + } + return annotations; + } + + private void updateNegativeExonCodingData(Exon exon, int cdna, int cds, Gtf gtf) { + // we need to increment 3 nts, the stop_codon length. 
+ exon.setGenomicCodingStart(gtf.getStart()); + // cdnaCodingEnd points to the same base position than genomicCodingStart + exon.setCdnaCodingEnd(exon.getEnd() - gtf.getStart() + cdna); + exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); + + // If the STOP codon corresponds to the first three nts of the exon then no CDS will be defined + // in the gtf -as technically the STOP codon is non-coding- and we must manually set coding + // starts + if (exon.getGenomicCodingEnd() == 0) { + exon.setGenomicCodingEnd(exon.getGenomicCodingStart() + 2); + } + if (exon.getCdnaCodingStart() == 0) { + exon.setCdnaCodingStart(exon.getCdnaCodingEnd() - 2); + } + if (exon.getCdsStart() == 0) { + exon.setCdsStart(exon.getCdsEnd() - 2); + } + } + + private void updateStopCodingDataPositiveExon(Exon exon, int cdna, int cds, Gtf gtf) { + // we need to increment 3 nts, the stop_codon length. + exon.setGenomicCodingEnd(gtf.getEnd()); + exon.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); + exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); + + // If the STOP codon corresponds to the first three nts of the exon then no CDS will be defined + // in the gtf -as technically the STOP codon is non-coding- and we must manually set coding + // starts + if (exon.getGenomicCodingStart() == 0) { + exon.setGenomicCodingStart(exon.getGenomicCodingEnd() - 2); + } + if (exon.getCdnaCodingStart() == 0) { + exon.setCdnaCodingStart(exon.getCdnaCodingEnd() - 2); + } + if (exon.getCdsStart() == 0) { + exon.setCdsStart(exon.getCdsEnd() - 2); + } + } + + private void addGtfXrefs(Transcript transcript, Gene gene, Map gtfAttributes) { + if (transcript.getXrefs() == null) { + transcript.setXrefs(new ArrayList<>()); + } + + transcript.getXrefs().add(new Xref(gene.getId(), "ensembl_gene", "Ensembl Gene")); + transcript.getXrefs().add(new Xref(transcript.getId(), "ensembl_transcript", "Ensembl Transcript")); + + // Some non-coding genes do not have Gene names + if (StringUtils.isNotEmpty(gene.getName())) 
{ + transcript.getXrefs().add(new Xref(gene.getName(), "hgnc_symbol", "HGNC Symbol")); + transcript.getXrefs().add(new Xref(transcript.getName(), "ensembl_transcript_name", "Ensembl Transcript Name")); + } + + if (gtfAttributes.get("ccds_id") != null) { + transcript.getXrefs().add(new Xref(gtfAttributes.get("ccds_id"), "ccds_id", "CCDS")); + } + } + + private void initializePointers(Map>> gtfMap) { + geneCounter = 0; + geneList = new ArrayList<>(gtfMap.keySet()); + geneName = geneList.get(geneCounter); + transcriptCounter = 0; + transcriptList = new ArrayList<>(gtfMap.get(geneName).keySet()); + transcriptName = transcriptList.get(transcriptCounter); + exonCounter = 0; + feature = "exon"; + nextGtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); + } + + private Gtf getGTFEntry(GtfReader gtfReader, Map>> gtfMap) throws FileFormatException { + // Flexible parsing is deactivated, return next line + if (gtfMap == null) { + return gtfReader.read(); + // Flexible parsing activated, carefully select next line to return + } else { + // No more genes/features to return + if (nextGtfToReturn == null) { + return null; + } + Gtf gtfToReturn = nextGtfToReturn; + if (feature.equals("exon")) { +// gtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); + if (gtfMap.get(geneName).get(transcriptName).containsKey("cds")) { + nextGtfToReturn = getExonCDSLine(((Gtf) ((List) gtfMap.get(geneName) + .get(transcriptName).get("exon")).get(exonCounter)).getStart(), + ((Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter)).getEnd(), + (List) gtfMap.get(geneName).get(transcriptName).get("cds")); + if (nextGtfToReturn != null) { + feature = "cds"; + return gtfToReturn; + } + } + // if no cds was found for this exon, get next exon + getFeatureFollowsExon(gtfMap); + return gtfToReturn; + } + if (feature.equals("cds") || feature.equals("stop_codon")) { + 
getFeatureFollowsExon(gtfMap); + return gtfToReturn; + } + if (feature.equals("start_codon")) { + feature = "stop_codon"; + nextGtfToReturn = (Gtf) gtfMap.get(geneName).get(transcriptName).get("stop_codon"); + return gtfToReturn; + } + // The only accepted features that should appear in the gtfMap are exon, cds, start_codon and stop_codon + throw new FileFormatException("Execution cannot reach this point"); + } + } + + private Gtf getExonCDSLine(Integer exonStart, Integer exonEnd, List cdsList) { + for (Object cdsObject : cdsList) { + int cdsStart = ((Gtf) cdsObject).getStart(); + int cdsEnd = ((Gtf) cdsObject).getEnd(); + if (cdsStart <= exonEnd && cdsEnd >= exonStart) { + return (Gtf) cdsObject; + } + } + return null; + } + + private void getFeatureFollowsExon(Map>> gtfMap) { + exonCounter++; + if (exonCounter == ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).size() + || feature.equals("stop_codon")) { + // If last returned feature was a stop_codon or no start_codon is provided for this transcript, + // next transcript must be selected + if (!feature.equals("stop_codon") && gtfMap.get(geneName).get(transcriptName).containsKey("start_codon")) { + feature = "start_codon"; + nextGtfToReturn = (Gtf) gtfMap.get(geneName).get(transcriptName).get("start_codon"); + } else { + transcriptCounter++; + // No more transcripts in this gene, check if there are more genes + if (transcriptCounter == gtfMap.get(geneName).size()) { + geneCounter++; + // No more genes available, end parsing + if (geneCounter == gtfMap.size()) { + nextGtfToReturn = null; + feature = null; + // Still more genes to parse, select next one + } else { + geneName = geneList.get(geneCounter); + transcriptCounter = 0; + transcriptList = new ArrayList<>(gtfMap.get(geneName).keySet()); + } + } + // Check if a new gene was selected - null would indicate there're no more genes + if (nextGtfToReturn != null) { + transcriptName = transcriptList.get(transcriptCounter); + exonCounter = 0; + feature = 
"exon"; + nextGtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); + } + } + } else { + feature = "exon"; + nextGtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); + } + } + + private Map>> loadGTFMap(GtfReader gtfReader) throws FileFormatException { + Map>> gtfMap = new HashMap<>(); + Gtf gtf; + while ((gtf = gtfReader.read()) != null) { + if (gtf.getFeature().equals("gene") || gtf.getFeature().equals("transcript") + || gtf.getFeature().equals("UTR") || gtf.getFeature().equals("Selenocysteine")) { + continue; + } + + // Get GTF lines associated with this gene - create a new Map of GTF entries if it's a new gene + String geneId = gtf.getAttributes().get("gene_id"); + // Transcript -> feature -> GTF line + Map> gtfMapGeneEntry; + if (gtfMap.containsKey(geneId)) { + gtfMapGeneEntry = gtfMap.get(geneId); + } else { + gtfMapGeneEntry = new HashMap(); + gtfMap.put(geneId, gtfMapGeneEntry); + } + + // Get GTF lines associated with this transcript - create a new Map of GTF entries if it's a new gene + String transcriptId = gtf.getAttributes().get("transcript_id"); + Map gtfMapTranscriptEntry; + if (gtfMapGeneEntry.containsKey(transcriptId)) { + gtfMapTranscriptEntry = gtfMapGeneEntry.get(transcriptId); + } else { + gtfMapTranscriptEntry = new HashMap(); + gtfMapGeneEntry.put(transcriptId, gtfMapTranscriptEntry); + } + + addGTFLineToGTFMap(gtfMapTranscriptEntry, gtf); + + } + + // Exon number is mandatory for the parser to be able to properly generate the gene data model + if (!exonNumberPresent(gtfMap)) { + setExonNumber(gtfMap); + } + + return gtfMap; + } + + private boolean exonNumberPresent(Map>> gtfMap) { + Map> geneGtfMap = gtfMap.get(gtfMap.keySet().iterator().next()); + return ((Gtf) ((List) geneGtfMap.get(geneGtfMap.keySet().iterator().next()).get("exon")).get(0)) + .getAttributes().containsKey("exon_number"); + } + + private void setExonNumber(Map>> gtfMap) { + for 
(String gene : gtfMap.keySet()) { + for (String transcript : gtfMap.get(gene).keySet()) { + List exonList = (List) gtfMap.get(gene).get(transcript).get("exon"); + Collections.sort(exonList, (e1, e2) -> Integer.valueOf(e1.getStart()).compareTo(e2.getStart())); + if (exonList.get(0).getStrand().equals("+")) { + int exonNumber = 1; + for (Gtf gtf : exonList) { + gtf.getAttributes().put("exon_number", String.valueOf(exonNumber)); + exonNumber++; + } + } else { + int exonNumber = exonList.size(); + for (Gtf gtf : exonList) { + gtf.getAttributes().put("exon_number", String.valueOf(exonNumber)); + exonNumber--; + } + } + } + } + } + + private void addGTFLineToGTFMap(Map gtfMapTranscriptEntry, Gtf gtf) { + // Add exon/cds GTF line to the corresponding gene entry in the map + String featureType = gtf.getFeature().toLowerCase(); + if (featureType.equals("exon") || featureType.equals("cds")) { + List gtfList; + // Check if there were exons already stored + if (gtfMapTranscriptEntry.containsKey(featureType)) { + gtfList = (List) gtfMapTranscriptEntry.get(featureType); + } else { + gtfList = new ArrayList<>(); + gtfMapTranscriptEntry.put(featureType, gtfList); + } + gtfList.add(gtf); + // Only one start/stop codon can be stored per transcript - no need to check if the "start_codon"/"stop_codon" + // keys are already there + } else if (featureType.equals("start_codon") || featureType.equals("stop_codon")) { + gtfMapTranscriptEntry.put(featureType, gtf); + } + } + + private List getTranscriptTfbses(Gtf transcript, String chromosome, TabixReader tabixReader) throws IOException { + if (tabixReader == null) { + return null; + } + List transcriptTfbses = null; + + int transcriptStart = transcript.getStart(); + int transcriptEnd = transcript.getEnd(); + + + String line; + TabixReader.Iterator iter = tabixReader.query(chromosome, transcriptStart, transcriptEnd); + while ((line = iter.next()) != null) { + String[] elements = line.split("\t"); + + String sequenceName = elements[0]; + 
String source = elements[1]; + String feature = elements[2]; + int start = Integer.parseInt(elements[3]); + int end = Integer.parseInt(elements[4]); + String score = elements[5]; + String strand = elements[6]; + String frame = elements[7]; + String attribute = elements[8]; + + if (strand.equals(transcript.getStrand())) { + continue; + } + + if (transcript.getStrand().equals("+")) { + if (start > transcript.getStart() + 500) { + break; + } else if (end > transcript.getStart() - 2500) { + Gff2 tfbs = new Gff2(sequenceName, source, feature, start, end, score, strand, frame, attribute); + transcriptTfbses = addTranscriptTfbstoList(tfbs, transcript, chromosome, transcriptTfbses); + } + } else { + // transcript in negative strand + if (start > transcript.getEnd() + 2500) { + break; + } else if (start > transcript.getEnd() - 500) { + Gff2 tfbs = new Gff2(sequenceName, source, feature, start, end, score, strand, frame, attribute); + transcriptTfbses = addTranscriptTfbstoList(tfbs, transcript, chromosome, transcriptTfbses); + } + } + } + + return transcriptTfbses; + } + + protected List addTranscriptTfbstoList(Gff2 tfbs, Gtf transcript, String chromosome, + List transcriptTfbses) { + if (transcriptTfbses == null) { + transcriptTfbses = new ArrayList<>(); + } + + // binding_matrix_stable_id=ENSPFM0542;epigenomes_with_experimental_evidence=SK-N.%2CMCF-7%2CH1-hESC_3%2CHCT116; + // stable_id=ENSM00208374688;transcription_factor_complex=TEAD4::ESRRB + String[] attributes = tfbs.getAttribute().split(";"); + + String id = null; + String pfmId = null; + List transciptionFactors = null; + + for (String attributePair : attributes) { + String[] attributePairArray = attributePair.split("="); + switch(attributePairArray[0]) { + case "binding_matrix_stable_id": + pfmId = attributePairArray[1]; + break; + case "stable_id": + id = attributePairArray[1]; + break; + case "transcription_factor_complex": + transciptionFactors = Arrays.asList(attributePairArray[1].split("(::)|(%2C)")); + break; 
+ default: + break; + } + } + + transcriptTfbses.add(new TranscriptTfbs(id, pfmId, tfbs.getFeature(), transciptionFactors, chromosome, tfbs.getStart(), + tfbs.getEnd(), getRelativeTranscriptTfbsStart(tfbs, transcript), getRelativeTranscriptTfbsEnd(tfbs, transcript), + Float.parseFloat(tfbs.getScore()))); + return transcriptTfbses; + } + + private Integer getRelativeTranscriptTfbsStart(Gff2 tfbs, Gtf transcript) { + Integer relativeStart; + if (transcript.getStrand().equals("+")) { + if (tfbs.getStart() < transcript.getStart()) { + relativeStart = tfbs.getStart() - transcript.getStart(); + } else { + relativeStart = tfbs.getStart() - transcript.getStart() + 1; + } + } else { + // negative strand transcript + if (tfbs.getEnd() > transcript.getEnd()) { + relativeStart = transcript.getEnd() - tfbs.getEnd(); + } else { + relativeStart = transcript.getEnd() - tfbs.getEnd() + 1; + } + } + return relativeStart; + } + + private Integer getRelativeTranscriptTfbsEnd(Gff2 tfbs, Gtf transcript) { + Integer relativeEnd; + if (transcript.getStrand().equals("+")) { + if (tfbs.getEnd() < transcript.getStart()) { + relativeEnd = tfbs.getEnd() - transcript.getStart(); + } else { + relativeEnd = tfbs.getEnd() - transcript.getStart() + 1; + } + } else { + if (tfbs.getStart() > transcript.getEnd()) { + relativeEnd = transcript.getEnd() - tfbs.getStart(); + } else { + relativeEnd = transcript.getEnd() - tfbs.getStart() + 1; + } + } + return relativeEnd; + } + + + + private boolean newGene(Gene previousGene, String newGeneId) { + return previousGene == null || !newGeneId.equals(previousGene.getId()); + } + + private void updateTranscriptAndGeneCoords(Transcript transcript, Gene gene, Gtf gtf) { + if (transcript.getStart() > gtf.getStart()) { + transcript.setStart(gtf.getStart()); + } + if (transcript.getEnd() < gtf.getEnd()) { + transcript.setEnd(gtf.getEnd()); + } + if (gene.getStart() > gtf.getStart()) { + gene.setStart(gtf.getStart()); + } + if (gene.getEnd() < gtf.getEnd()) { + 
gene.setEnd(gtf.getEnd()); + } + } + + private void getGtfFileFromGeneDirectoryPath(Path geneDirectoryPath) { + for (String fileName : geneDirectoryPath.toFile().list()) { + if (fileName.endsWith(".gtf") || fileName.endsWith(".gtf.gz")) { + gtfFile = geneDirectoryPath.resolve(fileName); + break; + } + } + } + + private void getProteinFastaFileFromGeneDirectoryPath(Path geneDirectoryPath) { + for (String fileName : geneDirectoryPath.toFile().list()) { + if (fileName.endsWith(".pep.all.fa") || fileName.endsWith(".pep.all.fa.gz")) { + proteinFastaFile = geneDirectoryPath.resolve(fileName); + break; + } + } + } + + private void getCDnaFastaFileFromGeneDirectoryPath(Path geneDirectoryPath) { + for (String fileName : geneDirectoryPath.toFile().list()) { + if (fileName.endsWith(".cdna.all.fa") || fileName.endsWith(".cdna.all.fa.gz")) { + cDnaFastaFile = geneDirectoryPath.resolve(fileName); + break; + } + } + } +} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java index fb67c19b8b..10f54e2ea1 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java @@ -16,27 +16,44 @@ package org.opencb.cellbase.lib.builders; +import com.fasterxml.jackson.core.JsonProcessingException; import org.apache.commons.lang3.StringUtils; -import org.apache.poi.hssf.usermodel.HSSFSheet; -import org.apache.poi.hssf.usermodel.HSSFWorkbook; -import org.apache.poi.ss.usermodel.*; -import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.opencb.biodata.formats.feature.mirbase.MirBaseParser; +import org.opencb.biodata.formats.feature.mirbase.MirBaseParserCallback; import org.opencb.biodata.formats.gaf.GafParser; import org.opencb.biodata.formats.io.FileFormatException; +import 
org.opencb.biodata.models.core.FeatureOntologyTermAnnotation; +import org.opencb.biodata.models.core.MiRnaGene; +import org.opencb.biodata.models.core.MirnaTarget; import org.opencb.biodata.models.core.Xref; -import org.opencb.biodata.models.core.*; -import org.opencb.biodata.models.variant.avro.*; +import org.opencb.biodata.models.variant.avro.Constraint; +import org.opencb.biodata.models.variant.avro.Expression; +import org.opencb.biodata.models.variant.avro.ExpressionCall; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.commons.utils.FileUtils; +import org.rocksdb.RocksDB; import org.rocksdb.RocksDBException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import java.io.*; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; import java.util.zip.GZIPInputStream; -public class EnsemblGeneBuilderIndexer extends GeneBuilderIndexer{ +import static org.opencb.cellbase.lib.EtlCommons.ENSEMBL_DATA; +import static org.opencb.cellbase.lib.builders.CellBaseBuilder.PARSING_DONE_LOG_MESSAGE; +import static org.opencb.cellbase.lib.builders.CellBaseBuilder.PARSING_LOG_MESSAGE; + +public class EnsemblGeneBuilderIndexer extends GeneBuilderIndexer { private static final String DESCRIPTION_SUFFIX = "_description"; private static final String XREF_SUFFIX = "_xref"; @@ -56,12 +73,12 @@ public void index(Path geneDescriptionFile, Path xrefsFile, Path hgncFile, Path Path proteinFastaFile, Path cDnaFastaFile, String species, Path geneExpressionFile, Path geneDrugFile, Path hpoFile, Path disgenetFile, Path gnomadFile, Path geneOntologyAnnotationFile, Path miRBaseFile, Path miRTarBaseFile, Path cancerGeneGensusFile, Path cancerHostpotFile, Path 
canonicalFile, Path tso500File, Path eglhHaemOncFile) - throws IOException, RocksDBException, FileFormatException { - indexDescriptions(geneDescriptionFile); - indexXrefs(xrefsFile, uniprotIdMappingFile); + throws IOException, RocksDBException, FileFormatException, CellBaseException { +// indexDescriptions(geneDescriptionFile); +// indexXrefs(xrefsFile, uniprotIdMappingFile); indexHgncIdMapping(hgncFile); - indexManeMapping(maneFile, "ensembl"); - indexLrgMapping(lrgFile, "ensembl"); + indexManeMapping(maneFile, ENSEMBL_DATA); + indexLrgMapping(lrgFile, ENSEMBL_DATA); indexProteinSequences(proteinFastaFile); indexCdnaSequences(cDnaFastaFile); indexExpression(species, geneExpressionFile); @@ -69,13 +86,13 @@ public void index(Path geneDescriptionFile, Path xrefsFile, Path hgncFile, Path indexDiseases(hpoFile, disgenetFile); indexConstraints(gnomadFile); indexOntologyAnnotations(geneOntologyAnnotationFile); - indexMiRBase(miRBaseFile); + indexMiRBase(species, miRBaseFile); indexMiRTarBase(miRTarBaseFile); - indexCancerGeneCensus(cancerGeneGensusFile); +// indexCancerGeneCensus(cancerGeneGensusFile); indexCancerHotspot(cancerHostpotFile); - indexCanonical(canonicalFile); - indexTSO500(tso500File); - indexEGLHHaemOnc(eglhHaemOncFile); +// indexCanonical(canonicalFile); +// indexTSO500(tso500File); +// indexEGLHHaemOnc(eglhHaemOncFile); } private void indexDescriptions(Path geneDescriptionFile) throws IOException, RocksDBException { @@ -233,129 +250,6 @@ public List getExpression(String id) throws RocksDBException, IOExce return rocksDbManager.getExpression(rocksdb, key); } - private void indexDrugs(Path geneDrugFile) throws IOException, RocksDBException { - if (geneDrugFile != null && Files.exists(geneDrugFile) && Files.size(geneDrugFile) > 0) { - logger.info("Loading gene-drug interaction data from '{}'", geneDrugFile); - BufferedReader br = FileUtils.newBufferedReader(geneDrugFile); - - // Skip header - br.readLine(); - - int lineCounter = 1; - String line; - String 
currentGene = ""; - List drugs = new ArrayList<>(); - while ((line = br.readLine()) != null) { - String[] parts = line.split("\t"); - String geneName = parts[0]; - if (currentGene.equals("")) { - currentGene = geneName; - } else if (!currentGene.equals(geneName)) { - rocksDbManager.update(rocksdb, currentGene + DRUGS_SUFFIX, drugs); - drugs = new ArrayList<>(); - currentGene = geneName; - } - - String source = null; - if (parts.length >= 4) { - source = parts[3]; - } - - String interactionType = null; - if (parts.length >= 5) { - interactionType = parts[4]; - } - - String drugName = null; - if (parts.length >= 8) { - // if drug name column is empty, use drug claim name instead - drugName = StringUtils.isEmpty(parts[7]) ? parts[6] : parts[7]; - } - if (StringUtils.isEmpty(drugName)) { - // no drug name - continue; - } - - String chemblId = null; - if (parts.length >= 9) { - chemblId = parts[8]; - } - - List publications = new ArrayList<>(); - if (parts.length >= 10 && parts[9] != null) { - publications = Arrays.asList(parts[9].split(",")); - } - - GeneDrugInteraction drug = new GeneDrugInteraction( - geneName, drugName, source, null, null, interactionType, chemblId, publications); - drugs.add(drug); - lineCounter++; - } - br.close(); - // update last gene - rocksDbManager.update(rocksdb, currentGene + DRUGS_SUFFIX, drugs); - } else { - logger.warn("Gene drug file " + geneDrugFile + " not found"); - logger.warn("Ignoring " + geneDrugFile); - } - } - - public List getDrugs(String id) throws RocksDBException, IOException { - String key = id + DRUGS_SUFFIX; - return rocksDbManager.getDrugs(rocksdb, key); - } - - private void indexDiseases(Path hpoFilePath, Path disgenetFilePath) throws IOException, RocksDBException { - Map> geneDiseaseAssociationMap = new HashMap<>(50000); - String line; - - if (hpoFilePath != null && hpoFilePath.toFile().exists() && Files.size(hpoFilePath) > 0) { - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(hpoFilePath)) { - // 
skip first header line - bufferedReader.readLine(); - while ((line = bufferedReader.readLine()) != null) { - String[] fields = line.split("\t"); - String omimId = fields[6]; - String geneSymbol = fields[3]; - String hpoId = fields[0]; - String diseaseName = fields[1]; - GeneTraitAssociation disease = - new GeneTraitAssociation(omimId, diseaseName, hpoId, 0f, 0, new ArrayList<>(), new ArrayList<>(), "hpo"); - addValueToMapElement(geneDiseaseAssociationMap, geneSymbol, disease); - } - } - } - - if (disgenetFilePath != null && disgenetFilePath.toFile().exists() && Files.size(disgenetFilePath) > 0) { - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(disgenetFilePath)) { - // skip first header line - bufferedReader.readLine(); - while ((line = bufferedReader.readLine()) != null) { - String[] fields = line.split("\t"); - String diseaseId = fields[4]; - String diseaseName = fields[5]; - String score = fields[9]; - String numberOfPubmeds = fields[13].trim(); - String numberOfSNPs = fields[14]; - String source = fields[15]; - GeneTraitAssociation disease = new GeneTraitAssociation(diseaseId, diseaseName, "", Float.parseFloat(score), - Integer.parseInt(numberOfPubmeds), Collections.singletonList(numberOfSNPs), Collections.singletonList(source), - "disgenet"); - addValueToMapElement(geneDiseaseAssociationMap, fields[1], disease); - } - } - } - - for (Map.Entry> entry : geneDiseaseAssociationMap.entrySet()) { - rocksDbManager.update(rocksdb, entry.getKey() + DISEASE_SUFFIX, entry.getValue()); - } - } - - public List getDiseases(String id) throws RocksDBException, IOException { - String key = id + DISEASE_SUFFIX; - return rocksDbManager.getDiseases(rocksdb, key); - } - private void indexConstraints(Path gnomadFile) throws IOException, RocksDBException { if (gnomadFile != null && Files.exists(gnomadFile) && Files.size(gnomadFile) > 0) { logger.info("Loading OE scores from '{}'", gnomadFile); @@ -384,7 +278,7 @@ private void indexConstraints(Path gnomadFile) 
throws IOException, RocksDBExcept rocksDbManager.update(rocksdb, transcriptIdentifier + CONSTRAINT_SUFFIX, constraints); if ("TRUE".equalsIgnoreCase(canonical)) { - rocksDbManager.update(rocksdb, geneIdentifier + CONSTRAINT_SUFFIX, constraints); + rocksDbManager.update(rocksdb, geneIdentifier + CONSTRAINT_SUFFIX, constraints); } } br.close(); @@ -432,66 +326,13 @@ public List getOntologyAnnotations(String id) thr return rocksDbManager.getOntologyAnnotations(rocksdb, key); } - private void indexMiRBase(Path miRBaseFile) throws IOException, RocksDBException { - if (miRBaseFile != null && Files.exists(miRBaseFile) && Files.size(miRBaseFile) > 0) { - logger.info("Loading mirna from '{}'", miRBaseFile); - FileInputStream fileInputStream = new FileInputStream(miRBaseFile.toFile()); - HSSFWorkbook workbook = new HSSFWorkbook(fileInputStream); - HSSFSheet sheet = workbook.getSheetAt(0); - Iterator iterator = sheet.iterator(); - while (iterator.hasNext()) { - Row currentRow = iterator.next(); - Iterator cellIterator = currentRow.iterator(); - - org.apache.poi.ss.usermodel.Cell cell = cellIterator.next(); - String miRBaseAccession = cell.getStringCellValue(); - - cell = cellIterator.next(); - String miRBaseID = cell.getStringCellValue(); - - cell = cellIterator.next(); - String status = cell.getStringCellValue(); - - cell = cellIterator.next(); - String sequence = cell.getStringCellValue(); + private void indexMiRBase(String species, Path miRBaseFile) throws IOException { + logger.info(PARSING_LOG_MESSAGE, miRBaseFile); - cell = cellIterator.next(); - String mature1Accession = cell.getStringCellValue(); + MirBaseCallback callback = new MirBaseCallback(rocksdb, rocksDbManager); + MirBaseParser.parse(miRBaseFile, species, callback); - cell = cellIterator.next(); - String mature1Id = cell.getStringCellValue(); - - cell = cellIterator.next(); - String mature1Sequence = cell.getStringCellValue(); - - String mature2Accession = ""; - String mature2Id = ""; - String mature2Sequence 
= ""; - if (cellIterator.hasNext()) { - cell = cellIterator.next(); - mature2Accession = cell.getStringCellValue(); - - cell = cellIterator.next(); - mature2Id = cell.getStringCellValue(); - - cell = cellIterator.next(); - mature2Sequence = cell.getStringCellValue(); - } - - MiRnaGene miRNAGene = new MiRnaGene(miRBaseAccession, miRBaseID, status, sequence, new ArrayList<>()); - int cdnaStart = sequence.indexOf(mature1Sequence); - int cdnaEnd = cdnaStart + mature1Sequence.length(); - miRNAGene.addMiRNAMature(mature1Accession, mature1Id, mature1Sequence, cdnaStart, cdnaEnd); - - cdnaStart = sequence.indexOf(mature2Sequence); - cdnaEnd = cdnaStart + mature2Sequence.length(); - miRNAGene.addMiRNAMature(mature2Accession, mature2Id, mature2Sequence, cdnaStart, cdnaEnd); - - rocksDbManager.update(rocksdb, miRBaseID + MIRBASE_SUFFIX, miRNAGene); - } - } else { - logger.error("mirna file not found"); - } + logger.info(PARSING_DONE_LOG_MESSAGE, miRBaseFile); } public MiRnaGene getMirnaGene(String transcriptId) throws RocksDBException, IOException { @@ -509,117 +350,11 @@ public MiRnaGene getMirnaGene(String transcriptId) throws RocksDBException, IOEx return null; } - private void indexMiRTarBase(Path miRTarBaseFile) throws IOException, RocksDBException { - if (miRTarBaseFile != null && Files.exists(miRTarBaseFile) && Files.size(miRTarBaseFile) > 0) { - logger.info("Loading mirna targets from '{}'", miRTarBaseFile); - FileInputStream file = new FileInputStream(miRTarBaseFile.toFile()); - Workbook workbook = new XSSFWorkbook(file); - Sheet sheet = workbook.getSheetAt(0); - Iterator iterator = sheet.iterator(); - String currentMiRTarBaseId = null; - String currentMiRNA = null; - String currentGene = null; - List targetGenes = new ArrayList<>(); - Map> geneToMirna = new HashMap<>(); - while (iterator.hasNext()) { - Row currentRow = iterator.next(); - - Iterator cellIterator = currentRow.iterator(); - Cell cell = cellIterator.next(); - - // Iterate columns - String miRTarBaseId = 
cell.getStringCellValue(); - - // skip header - if (miRTarBaseId.startsWith("miRTarBase")) { - continue; - } - - if (currentMiRTarBaseId == null) { - currentMiRTarBaseId = miRTarBaseId; - } - - cell = cellIterator.next(); - String miRNA = cell.getStringCellValue(); - if (currentMiRNA == null) { - currentMiRNA = miRNA; - } - - // Skip species - cellIterator.next(); - - // Read target gene - cell = cellIterator.next(); - String geneName = cell.getStringCellValue(); - if (currentGene == null) { - currentGene = geneName; - } - - // Skip entrez gene - cellIterator.next(); - // Skip species - cellIterator.next(); - - if (!miRTarBaseId.equals(currentMiRTarBaseId) || !geneName.equals(currentGene)) { - // new entry, store current one - MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, "miRTarBase", currentMiRNA, targetGenes); - addValueToMapElement(geneToMirna, currentGene, miRnaTarget); - targetGenes = new ArrayList<>(); - currentGene = geneName; - currentMiRTarBaseId = miRTarBaseId; - currentMiRNA = miRNA; - } - - // experiment - cell = cellIterator.next(); - String experiment = cell.getStringCellValue(); - - // support type - cell = cellIterator.next(); - String supportType = cell.getStringCellValue(); - - // pubmed - cell = cellIterator.next(); - String pubmed; - // seems to vary, so check both - if (cell.getCellType().equals(CellType.NUMERIC)) { -// pubmed = String.valueOf(cell.getNumericCellValue()); - pubmed = Integer.toString(Double.valueOf(cell.getNumericCellValue()).intValue()); - } else { - pubmed = cell.getStringCellValue(); - } - - targetGenes.add(new TargetGene(experiment, supportType, pubmed)); - } - - // parse last entry - MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, "miRTarBase", currentMiRNA, - targetGenes); - addValueToMapElement(geneToMirna, currentGene, miRnaTarget); - - for (Map.Entry> entry : geneToMirna.entrySet()) { - rocksDbManager.update(rocksdb, entry.getKey() + MIRTARBASE_SUFFIX, entry.getValue()); - } - } else { - 
logger.error("mirtarbase file not found"); - } - } - public List getMirnaTargets(String geneName) throws RocksDBException, IOException { String key = geneName + MIRTARBASE_SUFFIX; return rocksDbManager.getMirnaTargets(rocksdb, key); } - private static void addValueToMapElement(Map> map, String key, T value) { - if (map.containsKey(key)) { - map.get(key).add(value); - } else { - List valueList = new ArrayList<>(); - valueList.add(value); - map.put(key, valueList); - } - } - protected void indexCanonical(Path canonocalFile) throws IOException, RocksDBException { // Gene Transcript Canonical // ENSG00000210049.1 ENST00000387314.1 1 @@ -652,4 +387,30 @@ public String getCanonical(String transcriptId) throws RocksDBException, IOExcep } return new String(bytes); } + + // Implementation of the MirBaseParserCallback function + public class MirBaseCallback implements MirBaseParserCallback { + + private RocksDB rocksDB; + private RocksDbManager rocksDbManager; + private Logger logger; + + public MirBaseCallback(RocksDB rocksDB, RocksDbManager rocksDbManager) { + this.rocksDB = rocksDB; + this.rocksDbManager = rocksDbManager; + this.logger = LoggerFactory.getLogger(this.getClass()); + } + + @Override + public boolean processMiRnaGene(MiRnaGene miRnaGene) { + try { + rocksDbManager.update(rocksdb, miRnaGene.getId() + MIRBASE_SUFFIX, miRnaGene); + } catch (JsonProcessingException | RocksDBException e) { + logger.warn("Something wrong happened when processing miRNA gene {}: {}", miRnaGene.getId(), + StringUtils.join(e.getStackTrace(), "\t")); + return false; + } + return true; + } + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java index cd0863a259..970f73e05a 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java @@ -16,904 +16,54 @@ package 
org.opencb.cellbase.lib.builders; -import htsjdk.tribble.readers.TabixReader; -import org.apache.commons.lang3.StringUtils; -import org.opencb.biodata.formats.feature.gff.Gff2; -import org.opencb.biodata.formats.feature.gtf.Gtf; -import org.opencb.biodata.formats.feature.gtf.io.GtfReader; -import org.opencb.biodata.formats.io.FileFormatException; -import org.opencb.biodata.models.core.*; -import org.opencb.biodata.tools.sequence.FastaIndex; -import org.opencb.cellbase.core.ParamConstants; import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.rocksdb.RocksDBException; +import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; -import java.io.IOException; -import java.nio.file.Files; import java.nio.file.Path; -import java.util.*; -public class GeneBuilder extends CellBaseBuilder { +import static org.opencb.cellbase.lib.EtlCommons.*; - private Map transcriptDict; - private Map exonDict; +public class GeneBuilder extends CellBaseBuilder { - private Path gtfFile; - private Path proteinFastaFile; - private Path cDnaFastaFile; - private Path geneDescriptionFile; - private Path xrefsFile; - private Path hgncFile; - private Path maneFile; - private Path lrgFile; - private Path uniprotIdMappingFile; - private Path tfbsFile; - private Path tabixFile; - private Path geneExpressionFile; - private Path geneDrugFile; - private Path hpoFile; - private Path disgenetFile; - private Path genomeSequenceFilePath; - private Path gnomadFile; - private Path geneOntologyAnnotationFile; - private Path miRBaseFile; - private Path miRTarBaseFile; - private Path cancerGeneCensusFile; - private Path cancerHostpotFile; - private Path ensemblCanonicalFile; - private Path tso500File; - private Path eglhHaemOncFile; - private boolean flexibleGTFParsing; + private EnsemblGeneBuilder ensemblGeneBuilder; + private RefSeqGeneBuilder 
refSeqGeneBuilder; - // source for genes is either ensembl or refseq - private final String SOURCE = ParamConstants.QueryParams.ENSEMBL.key(); - private SpeciesConfiguration speciesConfiguration; + public GeneBuilder(Path downloadPath, Path buildPath, SpeciesConfiguration speciesConfiguration, boolean flexibleGTFParsing) + throws CellBaseException { + super(null); - private int geneCounter; - private ArrayList geneList; - private String geneName; - private int transcriptCounter; - private ArrayList transcriptList; - private String transcriptName; - private int exonCounter; - private String feature; - private Gtf nextGtfToReturn; + // Create Ensembl gene builder + CellBaseJsonFileSerializer ensemblGeneSerializer = new CellBaseJsonFileSerializer(buildPath.resolve(ENSEMBL_DATA), + ENSEMBL_GENE_BASENAME); + this.ensemblGeneBuilder = new EnsemblGeneBuilder(downloadPath.resolve(ENSEMBL_DATA), speciesConfiguration, flexibleGTFParsing, + ensemblGeneSerializer); - public GeneBuilder(Path geneDirectoryPath, Path genomeSequenceFastaFile, SpeciesConfiguration speciesConfiguration, - CellBaseSerializer serializer) throws CellBaseException { - this(geneDirectoryPath, genomeSequenceFastaFile, speciesConfiguration, false, serializer); + // Create RefSeq gene builder + CellBaseJsonFileSerializer refSeqGeneSerializer = new CellBaseJsonFileSerializer(buildPath.resolve(REFSEQ_DATA), + REFSEQ_GENE_BASENAME); + this.refSeqGeneBuilder = new RefSeqGeneBuilder(downloadPath.resolve(REFSEQ_DATA), speciesConfiguration, refSeqGeneSerializer); } - public GeneBuilder(Path geneDirectoryPath, Path genomeSequenceFastaFile, SpeciesConfiguration speciesConfiguration, - boolean flexibleGTFParsing, CellBaseSerializer serializer) throws CellBaseException { - this(null, geneDirectoryPath.resolve("description.txt"), - geneDirectoryPath.resolve("xrefs.txt"), - geneDirectoryPath.resolve("hgnc_complete_set_2023-11-01.txt"), - geneDirectoryPath.resolve("MANE.GRCh38.v1.1.summary.txt.gz"), - 
geneDirectoryPath.resolve("list_LRGs_transcripts_xrefs.txt"), - geneDirectoryPath.resolve("idmapping_selected.tab.gz"), - geneDirectoryPath.getParent().resolve("regulation/motif_features.gff.gz"), - geneDirectoryPath.getParent().resolve("regulation/motif_features.gff.gz.tbi"), - geneDirectoryPath.resolve("allgenes_updown_in_organism_part.tab.gz"), - geneDirectoryPath.resolve("dgidb.tsv"), - geneDirectoryPath.resolve("phenotype_to_genes.txt"), - geneDirectoryPath.resolve("all_gene_disease_associations.tsv.gz"), - geneDirectoryPath.resolve("gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz"), - geneDirectoryPath.resolve("goa_human.gaf.gz"), - geneDirectoryPath.getParent().resolve("regulation/miRNA.xls"), - geneDirectoryPath.getParent().resolve("regulation/hsa_MTI.xlsx"), - geneDirectoryPath.resolve("cancer-gene-census.tsv"), - geneDirectoryPath.resolve("hotspots_v2.xls"), - geneDirectoryPath.resolve("ensembl_canonical.txt"), - geneDirectoryPath.resolve("TSO500_transcripts.txt"), - geneDirectoryPath.resolve("EGLH_HaemOnc_transcripts.txt"), - genomeSequenceFastaFile, - speciesConfiguration, flexibleGTFParsing, serializer); + public void check() throws Exception { + // Check Ensembl requirements + ensemblGeneBuilder.check(); - getGtfFileFromGeneDirectoryPath(geneDirectoryPath); - getProteinFastaFileFromGeneDirectoryPath(geneDirectoryPath); - getCDnaFastaFileFromGeneDirectoryPath(geneDirectoryPath); - } - - public GeneBuilder(Path gtfFile, Path geneDescriptionFile, Path xrefsFile, Path hgncFile, Path maneFile, - Path lrgFile, Path uniprotIdMappingFile, Path tfbsFile, Path tabixFile, Path geneExpressionFile, - Path geneDrugFile, Path hpoFile, Path disgenetFile, Path gnomadFile, - Path geneOntologyAnnotationFile, Path miRBaseFile, Path miRTarBaseFile, Path cancerGeneCensusFile, - Path cancerHostpotFile, Path ensemblCanonicalFile, Path tso500File, Path eglhHaemOncFile, - Path genomeSequenceFilePath, SpeciesConfiguration speciesConfiguration, boolean flexibleGTFParsing, - 
CellBaseSerializer serializer) { - super(serializer); - - this.gtfFile = gtfFile; - this.geneDescriptionFile = geneDescriptionFile; - this.xrefsFile = xrefsFile; - this.hgncFile = hgncFile; - this.maneFile = maneFile; - this.lrgFile = lrgFile; - this.uniprotIdMappingFile = uniprotIdMappingFile; - this.tfbsFile = tfbsFile; - this.tabixFile = tabixFile; - this.geneExpressionFile = geneExpressionFile; - this.geneDrugFile = geneDrugFile; - this.hpoFile = hpoFile; - this.disgenetFile = disgenetFile; - this.gnomadFile = gnomadFile; - this.geneOntologyAnnotationFile = geneOntologyAnnotationFile; - this.miRBaseFile = miRBaseFile; - this.miRTarBaseFile = miRTarBaseFile; - this.cancerGeneCensusFile = cancerGeneCensusFile; - this.cancerHostpotFile = cancerHostpotFile; - this.ensemblCanonicalFile = ensemblCanonicalFile; - this.tso500File = tso500File; - this.eglhHaemOncFile = eglhHaemOncFile; - this.genomeSequenceFilePath = genomeSequenceFilePath; - this.speciesConfiguration = speciesConfiguration; - this.flexibleGTFParsing = flexibleGTFParsing; - - transcriptDict = new HashMap<>(250000); - exonDict = new HashMap<>(8000000); + // Check RefSeq requirements + refSeqGeneBuilder.check(); } + @Override public void parse() throws Exception { - Gene gene = null; - Transcript transcript; - Exon exon = null; - int cdna = 1; - int cds = 1; - EnsemblGeneBuilderIndexer indexer = new EnsemblGeneBuilderIndexer(gtfFile.getParent()); - - try { - // process files and put values in rocksdb - indexer.index(geneDescriptionFile, xrefsFile, hgncFile, maneFile, lrgFile, uniprotIdMappingFile, - proteinFastaFile, cDnaFastaFile, speciesConfiguration.getScientificName(), geneExpressionFile, - geneDrugFile, hpoFile, disgenetFile, gnomadFile, geneOntologyAnnotationFile, miRBaseFile, - miRTarBaseFile, cancerGeneCensusFile, cancerHostpotFile, ensemblCanonicalFile, - tso500File, eglhHaemOncFile); - - TabixReader tabixReader = null; - if (!Files.exists(tfbsFile) || !Files.exists(tabixFile)) { - 
logger.error("Tfbs or tabix file not found. Download them and try again."); - } else { - tabixReader = new TabixReader(tfbsFile.toAbsolutePath().toString(), tabixFile.toAbsolutePath().toString()); - } - - // Preparing the fasta file for fast accessing -// System.out.println("genomeSequenceFilePath.toString() = " + genomeSequenceFilePath.toString()); - FastaIndex fastaIndex = new FastaIndex(genomeSequenceFilePath); - - // Empty transcript and exon dictionaries - transcriptDict.clear(); - exonDict.clear(); - logger.info("Parsing gtf..."); - GtfReader gtfReader = new GtfReader(gtfFile); - - // Gene->Transcript->Feature->GTF line - Map>> gtfMap = null; - if (flexibleGTFParsing) { - gtfMap = loadGTFMap(gtfReader); - initializePointers(gtfMap); - } - - Gtf gtf; - while ((gtf = getGTFEntry(gtfReader, gtfMap)) != null) { - - if (gtf.getFeature().equals("gene") || gtf.getFeature().equals("transcript") - || gtf.getFeature().equals("UTR") || gtf.getFeature().equals("Selenocysteine")) { - continue; - } - - String geneId = gtf.getAttributes().get("gene_id"); - String transcriptId = gtf.getAttributes().get("transcript_id"); - String geneName = gtf.getAttributes().get("gene_name"); - if (newGene(gene, geneId)) { - // If new geneId is different from the current then we must serialize before data new gene - if (gene != null) { - serializer.serialize(gene); - } - - GeneAnnotation geneAnnotation = new GeneAnnotation(indexer.getExpression(geneId), indexer.getDiseases(geneName), - indexer.getDrugs(geneName), indexer.getConstraints(geneId), indexer.getMirnaTargets(geneName), - indexer.getCancerGeneCensus(geneName), indexer.getCancerHotspot(geneName)); - - gene = new Gene(geneId, geneName, gtf.getSequenceName().replaceFirst("chr", ""), - gtf.getStart(), gtf.getEnd(), gtf.getStrand(), gtf.getAttributes().get("gene_version"), - gtf.getAttributes().get("gene_biotype"), "KNOWN", SOURCE, indexer.getDescription(geneId), - new ArrayList<>(), indexer.getMirnaGene(transcriptId), geneAnnotation); 
- } - - // Check if Transcript exist in the Gene Set of transcripts - if (!transcriptDict.containsKey(transcriptId)) { - transcript = getTranscript(gene, indexer, tabixReader, gtf, transcriptId); - } else { - transcript = gene.getTranscripts().get(transcriptDict.get(transcriptId)); - } - - // At this point gene and transcript objects are set up - // Update gene and transcript genomic coordinates, start must be the - // lower, and end the higher - updateTranscriptAndGeneCoords(transcript, gene, gtf); - - String transcriptIdWithoutVersion = transcript.getId().split("\\.")[0]; - if (gtf.getFeature().equalsIgnoreCase("exon")) { - // Obtaining the exon sequence - String exonId = gtf.getAttributes().get("exon_id") + "." + gtf.getAttributes().get("exon_version"); - String exonSequence = fastaIndex.query(gtf.getSequenceName(), gtf.getStart(), gtf.getEnd()); - - exon = new Exon(exonId, gtf.getSequenceName().replaceFirst("chr", ""), - gtf.getStart(), gtf.getEnd(), gtf.getStrand(), 0, 0, 0, 0, 0, 0, -1, Integer.parseInt(gtf - .getAttributes().get("exon_number")), exonSequence); - transcript.getExons().add(exon); - - exonDict.put(transcriptIdWithoutVersion + "_" + exon.getExonNumber(), exon); - if (gtf.getAttributes().get("exon_number").equals("1")) { - cdna = 1; - cds = 1; - } else { - // with every exon we update cDNA length with the previous exon length - cdna += exonDict.get(transcriptIdWithoutVersion + "_" + (exon.getExonNumber() - 1)).getEnd() - - exonDict.get(transcriptIdWithoutVersion + "_" + (exon.getExonNumber() - 1)).getStart() + 1; - } - } else { - exon = exonDict.get(transcriptIdWithoutVersion + "_" + exon.getExonNumber()); - if (gtf.getFeature().equalsIgnoreCase("CDS")) { - // Protein ID is only present in CDS lines - String proteinId = gtf.getAttributes().get("protein_id") != null - ? gtf.getAttributes().get("protein_id") + "." 
+ gtf.getAttributes().get("protein_version") - : ""; - transcript.setProteinId(proteinId); - transcript.setProteinSequence(indexer.getProteinFasta(proteinId)); - - if (gtf.getStrand().equals("+") || gtf.getStrand().equals("1")) { - // CDS states the beginning of coding start - exon.setGenomicCodingStart(gtf.getStart()); - exon.setGenomicCodingEnd(gtf.getEnd()); - - // cDNA coordinates - exon.setCdnaCodingStart(gtf.getStart() - exon.getStart() + cdna); - exon.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); - // Set cdnaCodingEnd to prevent those cases without stop_codon - - transcript.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); - exon.setCdsStart(cds); - exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); - - // increment in the coding length - cds += gtf.getEnd() - gtf.getStart() + 1; - transcript.setCdsLength(cds - 1); // Set cdnaCodingEnd to prevent those cases without stop_codon - - exon.setPhase(Integer.parseInt(gtf.getFrame())); - - if (transcript.getGenomicCodingStart() == 0 || transcript.getGenomicCodingStart() > gtf.getStart()) { - transcript.setGenomicCodingStart(gtf.getStart()); - } - if (transcript.getGenomicCodingEnd() == 0 || transcript.getGenomicCodingEnd() < gtf.getEnd()) { - transcript.setGenomicCodingEnd(gtf.getEnd()); - } - // only first time - if (transcript.getCdnaCodingStart() == 0) { - transcript.setCdnaCodingStart(gtf.getStart() - exon.getStart() + cdna); - } - // strand - - } else { - // CDS states the beginning of coding start - exon.setGenomicCodingStart(gtf.getStart()); - exon.setGenomicCodingEnd(gtf.getEnd()); - // cDNA coordinates - // cdnaCodingStart points to the same base position than genomicCodingEnd - exon.setCdnaCodingStart(exon.getEnd() - gtf.getEnd() + cdna); - // cdnaCodingEnd points to the same base position than genomicCodingStart - exon.setCdnaCodingEnd(exon.getEnd() - gtf.getStart() + cdna); - // Set cdnaCodingEnd to prevent those cases without stop_codon - transcript.setCdnaCodingEnd(exon.getEnd() - 
gtf.getStart() + cdna); - exon.setCdsStart(cds); - exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); - - // increment in the coding length - cds += gtf.getEnd() - gtf.getStart() + 1; - transcript.setCdsLength(cds - 1); // Set cdnaCodingEnd to prevent those cases without stop_codon - exon.setPhase(Integer.parseInt(gtf.getFrame())); - - if (transcript.getGenomicCodingStart() == 0 || transcript.getGenomicCodingStart() > gtf.getStart()) { - transcript.setGenomicCodingStart(gtf.getStart()); - } - if (transcript.getGenomicCodingEnd() == 0 || transcript.getGenomicCodingEnd() < gtf.getEnd()) { - transcript.setGenomicCodingEnd(gtf.getEnd()); - } - // only first time - if (transcript.getCdnaCodingStart() == 0) { - // cdnaCodingStart points to the same base position than genomicCodingEnd - transcript.setCdnaCodingStart(exon.getEnd() - gtf.getEnd() + cdna); - } - } - - } -// if (gtf.getFeature().equalsIgnoreCase("start_codon")) { -// // nothing to do -// System.out.println("Empty block, this should be redesigned"); -// } - if (gtf.getFeature().equalsIgnoreCase("stop_codon")) { - // setCdnaCodingEnd = false; // stop_codon found, cdnaCodingEnd will be set here, - // no need to set it at the beginning of next feature - if (exon.getStrand().equals("+")) { - updateStopCodingDataPositiveExon(exon, cdna, cds, gtf); - - cds += gtf.getEnd() - gtf.getStart(); - // If stop_codon appears, overwrite values - transcript.setGenomicCodingEnd(gtf.getEnd()); - transcript.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); - transcript.setCdsLength(cds - 1); - - } else { - updateNegativeExonCodingData(exon, cdna, cds, gtf); - - cds += gtf.getEnd() - gtf.getStart(); - // If stop_codon appears, overwrite values - transcript.setGenomicCodingStart(gtf.getStart()); - // cdnaCodingEnd points to the same base position than genomicCodingStart - transcript.setCdnaCodingEnd(exon.getEnd() - gtf.getStart() + cdna); - transcript.setCdsLength(cds - 1); - } - } - } - } - - // last gene must be 
serialized - serializer.serialize(gene); - - // cleaning - gtfReader.close(); - serializer.close(); - fastaIndex.close(); - indexer.close(); - } catch (Exception e) { - indexer.close(); - throw e; - } - } - - private Transcript getTranscript(Gene gene, EnsemblGeneBuilderIndexer indexer, TabixReader tabixReader, Gtf gtf, String transcriptId) - throws IOException, RocksDBException { - Map gtfAttributes = gtf.getAttributes(); - - // To match Ensembl, we set the ID as transcript+version. This also matches the Ensembl website. - String transcriptIdWithVersion = transcriptId + "." + gtfAttributes.get("transcript_version"); - String biotype = gtfAttributes.get("transcript_biotype") != null ? gtfAttributes.get("transcript_biotype") : ""; - String transcriptChromosome = gtf.getSequenceName().replaceFirst("chr", ""); - List transcriptTfbses = getTranscriptTfbses(gtf, transcriptChromosome, tabixReader); - - List ontologyAnnotations = getOntologyAnnotations(indexer.getXrefs(transcriptId), indexer); - TranscriptAnnotation transcriptAnnotation = new TranscriptAnnotation(ontologyAnnotations, indexer.getConstraints(transcriptId)); - - Transcript transcript = new Transcript(transcriptIdWithVersion, gtfAttributes.get("transcript_name"), transcriptChromosome, - gtf.getStart(), gtf.getEnd(), gtf.getStrand(), biotype, "KNOWN", - 0, 0, 0, 0, 0, - indexer.getCdnaFasta(transcriptIdWithVersion), "", "", "", - gtfAttributes.get("transcript_version"), SOURCE, new ArrayList<>(), indexer.getXrefs(transcriptId), transcriptTfbses, - new HashSet<>(), transcriptAnnotation); - - // Adding Ids appearing in the GTF to the xrefs is required, since for some unknown reason the ENSEMBL - // Perl API often doesn't return all genes resulting in an incomplete xrefs.txt file. 
We must ensure - // that the xrefs array contains all ids present in the GTF file - addGtfXrefs(transcript, gene, gtfAttributes); - - // Add HGNC ID mappings, with this we can know which Ensembl and Refseq transcripts match to HGNC ID - String hgncId = indexer.getHgncId(gene.getName()); - if (StringUtils.isNotEmpty(hgncId)) { - transcript.getXrefs().add(new Xref(hgncId, "hgnc_id", "HGNC ID")); - } - - // Add MANE Select mappings, with this we can know which Ensembl and Refseq transcripts match according to MANE - for (String suffix: Arrays.asList("refseq", "refseq_protein")) { - String maneRefSeq = indexer.getMane(transcriptIdWithVersion, suffix); - if (StringUtils.isNotEmpty(maneRefSeq)) { - transcript.getXrefs().add(new Xref(maneRefSeq, "mane_select_" + suffix, - "MANE Select RefSeq" + (suffix.contains("_") ? " Protein" : ""))); - } - } - - // Add LRG mappings, with this we can know which Ensembl and Refseq transcripts match according to LRG - String lrgRefSeq = indexer.getLrg(transcriptIdWithVersion, "refseq"); - if (StringUtils.isNotEmpty(lrgRefSeq)) { - transcript.getXrefs().add(new Xref(lrgRefSeq, "lrg_refseq", "LRG RefSeq")); - } - - // Add Flags - // 1. GTF tags - String tags = gtf.getAttributes().get("tag"); - if (StringUtils.isNotEmpty(tags)) { - transcript.getFlags().addAll(Arrays.asList(tags.split(","))); - } - // 2. TSL - String supportLevel = gtfAttributes.get("transcript_support_level"); - if (StringUtils.isNotEmpty(supportLevel)) { - // split on space so "5 (assigned to previous version 3)" and "5" both become "TSL:5" - String truncatedSupportLevel = supportLevel.split(" ")[0]; - transcript.getFlags().add("TSL:" + truncatedSupportLevel); - } - // 3. MANE Flag - String maneFlag = indexer.getMane(transcriptIdWithVersion, "flag"); - if (StringUtils.isNotEmpty(maneFlag)) { - transcript.getFlags().add(maneFlag); - } - // 4. 
LRG Flag - String lrg = indexer.getLrg(transcriptIdWithVersion, "ensembl"); - if (StringUtils.isNotEmpty(lrg)) { - transcript.getFlags().add("LRG"); - } else { - for (Xref xref : transcript.getXrefs()) { - if (xref.getId().startsWith("LRG_") && xref.getId().contains("t")) { - transcript.getFlags().add("LRG"); - } - } - } - // 5. Ensembl Canonical - String canonicalFlag = indexer.getCanonical(transcriptIdWithVersion); - if (StringUtils.isNotEmpty(canonicalFlag)) { - transcript.getFlags().add(canonicalFlag); - } - - // 6. TSO500 and EGLH HaemOnc - String maneRefSeq = indexer.getMane(transcriptIdWithVersion, "refseq"); - if (StringUtils.isNotEmpty(maneRefSeq)) { - String tso500Flag = indexer.getTSO500(maneRefSeq.split("\\.")[0]); - if (StringUtils.isNotEmpty(tso500Flag)) { - transcript.getFlags().add(tso500Flag); - } - - String eglhHaemOncFlag = indexer.getEGLHHaemOnc(maneRefSeq.split("\\.")[0]); - if (StringUtils.isNotEmpty(eglhHaemOncFlag)) { - transcript.getFlags().add(eglhHaemOncFlag); - } - } - - gene.getTranscripts().add(transcript); - - // Do not change order!! size()-1 is the index of the transcript ID - transcriptDict.put(transcriptId, gene.getTranscripts().size() - 1); - return transcript; - } - - private List getOntologyAnnotations(List xrefs, EnsemblGeneBuilderIndexer indexer) - throws IOException, RocksDBException { - if (xrefs == null || indexer == null) { - return null; - } - List annotations = new ArrayList<>(); - for (Xref xref : xrefs) { - if (xref.getDbName().equals("uniprotkb_acc")) { - String key = xref.getId(); - if (key != null && indexer.getOntologyAnnotations(key) != null) { - annotations.addAll(indexer.getOntologyAnnotations(key)); - } - } - } - return annotations; - } - - private void updateNegativeExonCodingData(Exon exon, int cdna, int cds, Gtf gtf) { - // we need to increment 3 nts, the stop_codon length. 
- exon.setGenomicCodingStart(gtf.getStart()); - // cdnaCodingEnd points to the same base position than genomicCodingStart - exon.setCdnaCodingEnd(exon.getEnd() - gtf.getStart() + cdna); - exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); - - // If the STOP codon corresponds to the first three nts of the exon then no CDS will be defined - // in the gtf -as technically the STOP codon is non-coding- and we must manually set coding - // starts - if (exon.getGenomicCodingEnd() == 0) { - exon.setGenomicCodingEnd(exon.getGenomicCodingStart() + 2); - } - if (exon.getCdnaCodingStart() == 0) { - exon.setCdnaCodingStart(exon.getCdnaCodingEnd() - 2); - } - if (exon.getCdsStart() == 0) { - exon.setCdsStart(exon.getCdsEnd() - 2); - } - } - - private void updateStopCodingDataPositiveExon(Exon exon, int cdna, int cds, Gtf gtf) { - // we need to increment 3 nts, the stop_codon length. - exon.setGenomicCodingEnd(gtf.getEnd()); - exon.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); - exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); + logger.info(BUILDING_LOG_MESSAGE, getDataName(GENE_DATA)); - // If the STOP codon corresponds to the first three nts of the exon then no CDS will be defined - // in the gtf -as technically the STOP codon is non-coding- and we must manually set coding - // starts - if (exon.getGenomicCodingStart() == 0) { - exon.setGenomicCodingStart(exon.getGenomicCodingEnd() - 2); - } - if (exon.getCdnaCodingStart() == 0) { - exon.setCdnaCodingStart(exon.getCdnaCodingEnd() - 2); - } - if (exon.getCdsStart() == 0) { - exon.setCdsStart(exon.getCdsEnd() - 2); - } - } - - private void addGtfXrefs(Transcript transcript, Gene gene, Map gtfAttributes) { - if (transcript.getXrefs() == null) { - transcript.setXrefs(new ArrayList<>()); - } - - transcript.getXrefs().add(new Xref(gene.getId(), "ensembl_gene", "Ensembl Gene")); - transcript.getXrefs().add(new Xref(transcript.getId(), "ensembl_transcript", "Ensembl Transcript")); - - // Some non-coding genes do not 
have Gene names - if (StringUtils.isNotEmpty(gene.getName())) { - transcript.getXrefs().add(new Xref(gene.getName(), "hgnc_symbol", "HGNC Symbol")); - transcript.getXrefs().add(new Xref(transcript.getName(), "ensembl_transcript_name", "Ensembl Transcript Name")); - } - - if (gtfAttributes.get("ccds_id") != null) { - transcript.getXrefs().add(new Xref(gtfAttributes.get("ccds_id"), "ccds_id", "CCDS")); - } - } - - private void initializePointers(Map>> gtfMap) { - geneCounter = 0; - geneList = new ArrayList<>(gtfMap.keySet()); - geneName = geneList.get(geneCounter); - transcriptCounter = 0; - transcriptList = new ArrayList<>(gtfMap.get(geneName).keySet()); - transcriptName = transcriptList.get(transcriptCounter); - exonCounter = 0; - feature = "exon"; - nextGtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); - } - - private Gtf getGTFEntry(GtfReader gtfReader, Map>> gtfMap) throws FileFormatException { - // Flexible parsing is deactivated, return next line - if (gtfMap == null) { - return gtfReader.read(); - // Flexible parsing activated, carefully select next line to return - } else { - // No more genes/features to return - if (nextGtfToReturn == null) { - return null; - } - Gtf gtfToReturn = nextGtfToReturn; - if (feature.equals("exon")) { -// gtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); - if (gtfMap.get(geneName).get(transcriptName).containsKey("cds")) { - nextGtfToReturn = getExonCDSLine(((Gtf) ((List) gtfMap.get(geneName) - .get(transcriptName).get("exon")).get(exonCounter)).getStart(), - ((Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter)).getEnd(), - (List) gtfMap.get(geneName).get(transcriptName).get("cds")); - if (nextGtfToReturn != null) { - feature = "cds"; - return gtfToReturn; - } - } - // if no cds was found for this exon, get next exon - getFeatureFollowsExon(gtfMap); - return gtfToReturn; - } - if 
(feature.equals("cds") || feature.equals("stop_codon")) { - getFeatureFollowsExon(gtfMap); - return gtfToReturn; - } - if (feature.equals("start_codon")) { - feature = "stop_codon"; - nextGtfToReturn = (Gtf) gtfMap.get(geneName).get(transcriptName).get("stop_codon"); - return gtfToReturn; - } - // The only accepted features that should appear in the gtfMap are exon, cds, start_codon and stop_codon - throw new FileFormatException("Execution cannot reach this point"); - } - } - - private Gtf getExonCDSLine(Integer exonStart, Integer exonEnd, List cdsList) { - for (Object cdsObject : cdsList) { - int cdsStart = ((Gtf) cdsObject).getStart(); - int cdsEnd = ((Gtf) cdsObject).getEnd(); - if (cdsStart <= exonEnd && cdsEnd >= exonStart) { - return (Gtf) cdsObject; - } - } - return null; - } - - private void getFeatureFollowsExon(Map>> gtfMap) { - exonCounter++; - if (exonCounter == ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).size() - || feature.equals("stop_codon")) { - // If last returned feature was a stop_codon or no start_codon is provided for this transcript, - // next transcript must be selected - if (!feature.equals("stop_codon") && gtfMap.get(geneName).get(transcriptName).containsKey("start_codon")) { - feature = "start_codon"; - nextGtfToReturn = (Gtf) gtfMap.get(geneName).get(transcriptName).get("start_codon"); - } else { - transcriptCounter++; - // No more transcripts in this gene, check if there are more genes - if (transcriptCounter == gtfMap.get(geneName).size()) { - geneCounter++; - // No more genes available, end parsing - if (geneCounter == gtfMap.size()) { - nextGtfToReturn = null; - feature = null; - // Still more genes to parse, select next one - } else { - geneName = geneList.get(geneCounter); - transcriptCounter = 0; - transcriptList = new ArrayList<>(gtfMap.get(geneName).keySet()); - } - } - // Check if a new gene was selected - null would indicate there're no more genes - if (nextGtfToReturn != null) { - transcriptName = 
transcriptList.get(transcriptCounter); - exonCounter = 0; - feature = "exon"; - nextGtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); - } - } - } else { - feature = "exon"; - nextGtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); - } - } - - private Map>> loadGTFMap(GtfReader gtfReader) throws FileFormatException { - Map>> gtfMap = new HashMap<>(); - Gtf gtf; - while ((gtf = gtfReader.read()) != null) { - if (gtf.getFeature().equals("gene") || gtf.getFeature().equals("transcript") - || gtf.getFeature().equals("UTR") || gtf.getFeature().equals("Selenocysteine")) { - continue; - } - - // Get GTF lines associated with this gene - create a new Map of GTF entries if it's a new gene - String geneId = gtf.getAttributes().get("gene_id"); - // Transcript -> feature -> GTF line - Map> gtfMapGeneEntry; - if (gtfMap.containsKey(geneId)) { - gtfMapGeneEntry = gtfMap.get(geneId); - } else { - gtfMapGeneEntry = new HashMap(); - gtfMap.put(geneId, gtfMapGeneEntry); - } - - // Get GTF lines associated with this transcript - create a new Map of GTF entries if it's a new gene - String transcriptId = gtf.getAttributes().get("transcript_id"); - Map gtfMapTranscriptEntry; - if (gtfMapGeneEntry.containsKey(transcriptId)) { - gtfMapTranscriptEntry = gtfMapGeneEntry.get(transcriptId); - } else { - gtfMapTranscriptEntry = new HashMap(); - gtfMapGeneEntry.put(transcriptId, gtfMapTranscriptEntry); - } - - addGTFLineToGTFMap(gtfMapTranscriptEntry, gtf); - - } - - // Exon number is mandatory for the parser to be able to properly generate the gene data model - if (!exonNumberPresent(gtfMap)) { - setExonNumber(gtfMap); - } - - return gtfMap; - } - - private boolean exonNumberPresent(Map>> gtfMap) { - Map> geneGtfMap = gtfMap.get(gtfMap.keySet().iterator().next()); - return ((Gtf) ((List) geneGtfMap.get(geneGtfMap.keySet().iterator().next()).get("exon")).get(0)) - 
.getAttributes().containsKey("exon_number"); - } - - private void setExonNumber(Map>> gtfMap) { - for (String gene : gtfMap.keySet()) { - for (String transcript : gtfMap.get(gene).keySet()) { - List exonList = (List) gtfMap.get(gene).get(transcript).get("exon"); - Collections.sort(exonList, (e1, e2) -> Integer.valueOf(e1.getStart()).compareTo(e2.getStart())); - if (exonList.get(0).getStrand().equals("+")) { - int exonNumber = 1; - for (Gtf gtf : exonList) { - gtf.getAttributes().put("exon_number", String.valueOf(exonNumber)); - exonNumber++; - } - } else { - int exonNumber = exonList.size(); - for (Gtf gtf : exonList) { - gtf.getAttributes().put("exon_number", String.valueOf(exonNumber)); - exonNumber--; - } - } - } - } - } - - private void addGTFLineToGTFMap(Map gtfMapTranscriptEntry, Gtf gtf) { - // Add exon/cds GTF line to the corresponding gene entry in the map - String featureType = gtf.getFeature().toLowerCase(); - if (featureType.equals("exon") || featureType.equals("cds")) { - List gtfList; - // Check if there were exons already stored - if (gtfMapTranscriptEntry.containsKey(featureType)) { - gtfList = (List) gtfMapTranscriptEntry.get(featureType); - } else { - gtfList = new ArrayList<>(); - gtfMapTranscriptEntry.put(featureType, gtfList); - } - gtfList.add(gtf); - // Only one start/stop codon can be stored per transcript - no need to check if the "start_codon"/"stop_codon" - // keys are already there - } else if (featureType.equals("start_codon") || featureType.equals("stop_codon")) { - gtfMapTranscriptEntry.put(featureType, gtf); - } - } + // Check folders and files before building + check(); - private List getTranscriptTfbses(Gtf transcript, String chromosome, TabixReader tabixReader) throws IOException { - if (tabixReader == null) { - return null; - } - List transcriptTfbses = null; - - int transcriptStart = transcript.getStart(); - int transcriptEnd = transcript.getEnd(); - - - String line; - TabixReader.Iterator iter = tabixReader.query(chromosome, 
transcriptStart, transcriptEnd); - while ((line = iter.next()) != null) { - String[] elements = line.split("\t"); - - String sequenceName = elements[0]; - String source = elements[1]; - String feature = elements[2]; - int start = Integer.parseInt(elements[3]); - int end = Integer.parseInt(elements[4]); - String score = elements[5]; - String strand = elements[6]; - String frame = elements[7]; - String attribute = elements[8]; - - if (strand.equals(transcript.getStrand())) { - continue; - } - - if (transcript.getStrand().equals("+")) { - if (start > transcript.getStart() + 500) { - break; - } else if (end > transcript.getStart() - 2500) { - Gff2 tfbs = new Gff2(sequenceName, source, feature, start, end, score, strand, frame, attribute); - transcriptTfbses = addTranscriptTfbstoList(tfbs, transcript, chromosome, transcriptTfbses); - } - } else { - // transcript in negative strand - if (start > transcript.getEnd() + 2500) { - break; - } else if (start > transcript.getEnd() - 500) { - Gff2 tfbs = new Gff2(sequenceName, source, feature, start, end, score, strand, frame, attribute); - transcriptTfbses = addTranscriptTfbstoList(tfbs, transcript, chromosome, transcriptTfbses); - } - } - } - - return transcriptTfbses; - } - - protected List addTranscriptTfbstoList(Gff2 tfbs, Gtf transcript, String chromosome, - List transcriptTfbses) { - if (transcriptTfbses == null) { - transcriptTfbses = new ArrayList<>(); - } - - // binding_matrix_stable_id=ENSPFM0542;epigenomes_with_experimental_evidence=SK-N.%2CMCF-7%2CH1-hESC_3%2CHCT116; - // stable_id=ENSM00208374688;transcription_factor_complex=TEAD4::ESRRB - String[] attributes = tfbs.getAttribute().split(";"); - - String id = null; - String pfmId = null; - List transciptionFactors = null; - - for (String attributePair : attributes) { - String[] attributePairArray = attributePair.split("="); - switch(attributePairArray[0]) { - case "binding_matrix_stable_id": - pfmId = attributePairArray[1]; - break; - case "stable_id": - id = 
attributePairArray[1]; - break; - case "transcription_factor_complex": - transciptionFactors = Arrays.asList(attributePairArray[1].split("(::)|(%2C)")); - break; - default: - break; - } - } - - transcriptTfbses.add(new TranscriptTfbs(id, pfmId, tfbs.getFeature(), transciptionFactors, chromosome, tfbs.getStart(), - tfbs.getEnd(), getRelativeTranscriptTfbsStart(tfbs, transcript), getRelativeTranscriptTfbsEnd(tfbs, transcript), - Float.parseFloat(tfbs.getScore()))); - return transcriptTfbses; - } - - private Integer getRelativeTranscriptTfbsStart(Gff2 tfbs, Gtf transcript) { - Integer relativeStart; - if (transcript.getStrand().equals("+")) { - if (tfbs.getStart() < transcript.getStart()) { - relativeStart = tfbs.getStart() - transcript.getStart(); - } else { - relativeStart = tfbs.getStart() - transcript.getStart() + 1; - } - } else { - // negative strand transcript - if (tfbs.getEnd() > transcript.getEnd()) { - relativeStart = transcript.getEnd() - tfbs.getEnd(); - } else { - relativeStart = transcript.getEnd() - tfbs.getEnd() + 1; - } - } - return relativeStart; - } - - private Integer getRelativeTranscriptTfbsEnd(Gff2 tfbs, Gtf transcript) { - Integer relativeEnd; - if (transcript.getStrand().equals("+")) { - if (tfbs.getEnd() < transcript.getStart()) { - relativeEnd = tfbs.getEnd() - transcript.getStart(); - } else { - relativeEnd = tfbs.getEnd() - transcript.getStart() + 1; - } - } else { - if (tfbs.getStart() > transcript.getEnd()) { - relativeEnd = transcript.getEnd() - tfbs.getStart(); - } else { - relativeEnd = transcript.getEnd() - tfbs.getStart() + 1; - } - } - return relativeEnd; - } - - - - private boolean newGene(Gene previousGene, String newGeneId) { - return previousGene == null || !newGeneId.equals(previousGene.getId()); - } - - private void updateTranscriptAndGeneCoords(Transcript transcript, Gene gene, Gtf gtf) { - if (transcript.getStart() > gtf.getStart()) { - transcript.setStart(gtf.getStart()); - } - if (transcript.getEnd() < gtf.getEnd()) { - 
transcript.setEnd(gtf.getEnd()); - } - if (gene.getStart() > gtf.getStart()) { - gene.setStart(gtf.getStart()); - } - if (gene.getEnd() < gtf.getEnd()) { - gene.setEnd(gtf.getEnd()); - } - } - - private void getGtfFileFromGeneDirectoryPath(Path geneDirectoryPath) { - for (String fileName : geneDirectoryPath.toFile().list()) { - if (fileName.endsWith(".gtf") || fileName.endsWith(".gtf.gz")) { - gtfFile = geneDirectoryPath.resolve(fileName); - break; - } - } - } - - private void getProteinFastaFileFromGeneDirectoryPath(Path geneDirectoryPath) { - for (String fileName : geneDirectoryPath.toFile().list()) { - if (fileName.endsWith(".pep.all.fa") || fileName.endsWith(".pep.all.fa.gz")) { - proteinFastaFile = geneDirectoryPath.resolve(fileName); - break; - } - } - } + // Build Ensembl/RefSeq genes + ensemblGeneBuilder.parse(); + refSeqGeneBuilder.parse(); - private void getCDnaFastaFileFromGeneDirectoryPath(Path geneDirectoryPath) { - for (String fileName : geneDirectoryPath.toFile().list()) { - if (fileName.endsWith(".cdna.all.fa") || fileName.endsWith(".cdna.all.fa.gz")) { - cDnaFastaFile = geneDirectoryPath.resolve(fileName); - break; - } - } + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(GENE_DATA)); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java index 285236ba60..b8941cc448 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java @@ -24,9 +24,10 @@ import org.opencb.biodata.formats.sequence.fasta.Fasta; import org.opencb.biodata.formats.sequence.fasta.io.FastaReader; import org.opencb.biodata.models.clinical.ClinicalProperty; -import org.opencb.biodata.models.core.CancerHotspot; -import org.opencb.biodata.models.core.CancerHotspotVariant; -import 
org.opencb.biodata.models.core.GeneCancerAssociation; +import org.opencb.biodata.models.core.*; +import org.opencb.biodata.models.variant.avro.GeneDrugInteraction; +import org.opencb.biodata.models.variant.avro.GeneTraitAssociation; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.commons.utils.FileUtils; import org.rocksdb.Options; import org.rocksdb.RocksDB; @@ -42,8 +43,14 @@ import java.util.*; import java.util.stream.Collectors; +import static org.opencb.cellbase.lib.EtlCommons.*; +import static org.opencb.cellbase.lib.builders.CellBaseBuilder.PARSING_DONE_LOG_MESSAGE; +import static org.opencb.cellbase.lib.builders.CellBaseBuilder.PARSING_LOG_MESSAGE; + public class GeneBuilderIndexer { + public static final String ROCKSDB_FOLDER = "rocksdb.idx"; + protected RocksDB rocksdb; protected RocksDbManager rocksDbManager; protected Logger logger; @@ -69,7 +76,7 @@ public GeneBuilderIndexer(Path genePath) { private void init(Path genePath) { rocksDbManager = new RocksDbManager(); - dbLocation = genePath.resolve("integration.idx").toString(); + dbLocation = genePath.resolve(ROCKSDB_FOLDER).toString(); rocksdb = rocksDbManager.getDBConnection(dbLocation); dbOption = new Options().setCreateIfMissing(true); @@ -77,18 +84,14 @@ private void init(Path genePath) { } protected void indexCdnaSequences(Path cDnaFastaFile) throws IOException, FileFormatException, RocksDBException { - logger.info("Loading RefSeq's cDNA sequences..."); - FileUtils.checkPath(cDnaFastaFile); - if (Files.size(cDnaFastaFile) > 0) { - FastaReader fastaReader = new FastaReader(cDnaFastaFile); - Fasta fasta; - while ((fasta = fastaReader.read()) != null) { - rocksDbManager.update(rocksdb, fasta.getId() + CDNA_SEQUENCE_SUFFIX, fasta.getSeq()); - } - fastaReader.close(); - } else { - logger.warn("RefSeq's cDNA sequences not loaded"); + logger.info(PARSING_LOG_MESSAGE, cDnaFastaFile); + FastaReader fastaReader = new FastaReader(cDnaFastaFile); + Fasta fasta; + while ((fasta = 
fastaReader.read()) != null) { + rocksDbManager.update(rocksdb, fasta.getId() + CDNA_SEQUENCE_SUFFIX, fasta.getSeq()); } + fastaReader.close(); + logger.info(PARSING_DONE_LOG_MESSAGE, cDnaFastaFile); } public String getCdnaFasta(String id) throws RocksDBException { @@ -96,18 +99,14 @@ public String getCdnaFasta(String id) throws RocksDBException { } protected void indexProteinSequences(Path proteinFastaFile) throws IOException, FileFormatException, RocksDBException { - logger.info("Loading ENSEMBL's protein sequences..."); - FileUtils.checkPath(proteinFastaFile); - if (Files.size(proteinFastaFile) > 0) { - FastaReader fastaReader = new FastaReader(proteinFastaFile); - Fasta fasta; - while ((fasta = fastaReader.read()) != null) { - rocksDbManager.update(rocksdb, fasta.getId() + PROTEIN_SEQUENCE_SUFFIX, fasta.getSeq()); - } - fastaReader.close(); - } else { - logger.warn("ENSEMBL's protein sequences not loaded"); + logger.info(PARSING_LOG_MESSAGE, proteinFastaFile); + FastaReader fastaReader = new FastaReader(proteinFastaFile); + Fasta fasta; + while ((fasta = fastaReader.read()) != null) { + rocksDbManager.update(rocksdb, fasta.getId() + PROTEIN_SEQUENCE_SUFFIX, fasta.getSeq()); } + fastaReader.close(); + logger.info(PARSING_DONE_LOG_MESSAGE, proteinFastaFile); } protected String getProteinFasta(String id) throws RocksDBException { @@ -115,22 +114,18 @@ protected String getProteinFasta(String id) throws RocksDBException { } protected void indexHgncIdMapping(Path hgncMappingFile) throws IOException, RocksDBException { - // #hgnc_id symbol name locus_group locus_type status location location_sortable ... 
- logger.info("Indexing HGNC ID mapping data ..."); - - // We only need the first two columns: hgnc_id -> symbol - if (hgncMappingFile != null && Files.exists(hgncMappingFile) && Files.size(hgncMappingFile) > 0) { - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(hgncMappingFile)) { - String line = bufferedReader.readLine(); - while (StringUtils.isNotEmpty(line)) { - String[] fields = line.split("\t", -1); - rocksDbManager.update(rocksdb, fields[1] + HGNC_ID_SUFFIX, fields[0]); - line = bufferedReader.readLine(); - } + logger.info(PARSING_LOG_MESSAGE, hgncMappingFile); + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(hgncMappingFile)) { + String line = bufferedReader.readLine(); + // We only need the first two columns: hgnc_id -> symbol + // #hgnc_id symbol name locus_group locus_type status location location_sortable ... + while (StringUtils.isNotEmpty(line)) { + String[] fields = line.split("\t", -1); + rocksDbManager.update(rocksdb, fields[1] + HGNC_ID_SUFFIX, fields[0]); + line = bufferedReader.readLine(); } - } else { - logger.warn("HGNC ID mapping file " + hgncMappingFile + " not found"); } + logger.info(PARSING_DONE_LOG_MESSAGE, hgncMappingFile); } public String getHgncId(String id) throws RocksDBException { @@ -138,29 +133,25 @@ public String getHgncId(String id) throws RocksDBException { } protected void indexManeMapping(Path maneMappingFile, String referenceId) throws IOException, RocksDBException { + logger.info(PARSING_LOG_MESSAGE, maneMappingFile); + int idColumn = referenceId.equalsIgnoreCase(ENSEMBL_DATA) ? 7 : 5; + // #NCBI_GeneID Ensembl_Gene HGNC_ID symbol name RefSeq_nuc RefSeq_prot Ensembl_nuc Ensembl_prot // MANE_status GRCh38_chr chr_start chr_end chr_strand - logger.info("Indexing MANE mapping data ..."); - - if (maneMappingFile != null && Files.exists(maneMappingFile) && Files.size(maneMappingFile) > 0) { - int idColumn = referenceId.equalsIgnoreCase("ensembl") ? 
7 : 5; -// BufferedReader bufferedReader = FileUtils.newBufferedReader(maneMappingFile); - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(maneMappingFile)) { - String line = bufferedReader.readLine(); - while (StringUtils.isNotEmpty(line)) { - String[] fields = line.split("\t", -1); - rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_refseq", fields[5]); - rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_refseq_protein", fields[6]); - rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_ensembl", fields[7]); - rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_ensembl_protein", fields[8]); - rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_flag", fields[9]); + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(maneMappingFile)) { + String line = bufferedReader.readLine(); + while (StringUtils.isNotEmpty(line)) { + String[] fields = line.split("\t", -1); + rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_refseq", fields[5]); + rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_refseq_protein", fields[6]); + rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_ensembl", fields[7]); + rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_ensembl_protein", fields[8]); + rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_flag", fields[9]); - line = bufferedReader.readLine(); - } + line = bufferedReader.readLine(); } - } else { - logger.warn("MANE mapping file " + maneMappingFile + " not found"); } + logger.info(PARSING_DONE_LOG_MESSAGE, maneMappingFile); } public String getMane(String id, String field) throws RocksDBException { @@ -168,30 +159,27 @@ public String getMane(String id, String field) throws RocksDBException { } protected void indexLrgMapping(Path lrgMappingFile, String referenceId) throws IOException, RocksDBException { + logger.info(PARSING_LOG_MESSAGE, lrgMappingFile); + // # 
Last modified: 30-03-2021@22:00:06 // # LRG HGNC_SYMBOL REFSEQ_GENOMIC LRG_TRANSCRIPT REFSEQ_TRANSCRIPT ENSEMBL_TRANSCRIPT CCDS // LRG_1 COL1A1 NG_007400.1 t1 NM_000088.3 ENST00000225964.10 CCDS11561.1 - logger.info("Indexing LRG mapping data ..."); - - if (lrgMappingFile != null && Files.exists(lrgMappingFile) && Files.size(lrgMappingFile) > 0) { - int idColumn = referenceId.equalsIgnoreCase("ensembl") ? 5 : 4; - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(lrgMappingFile)) { - String line = bufferedReader.readLine(); - while (StringUtils.isNotEmpty(line)) { - if (!line.startsWith("#")) { - String[] fields = line.split("\t", -1); - String id = fields[idColumn]; - if (StringUtils.isNotEmpty(id) && !id.equals("-")) { - rocksDbManager.update(rocksdb, id + LRG_SUFFIX + "_refseq", fields[4]); - rocksDbManager.update(rocksdb, id + LRG_SUFFIX + "_ensembl", fields[5]); - } + int idColumn = referenceId.equalsIgnoreCase("ensembl") ? 5 : 4; + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(lrgMappingFile)) { + String line = bufferedReader.readLine(); + while (StringUtils.isNotEmpty(line)) { + if (!line.startsWith("#")) { + String[] fields = line.split("\t", -1); + String id = fields[idColumn]; + if (StringUtils.isNotEmpty(id) && !id.equals("-")) { + rocksDbManager.update(rocksdb, id + LRG_SUFFIX + "_refseq", fields[4]); + rocksDbManager.update(rocksdb, id + LRG_SUFFIX + "_ensembl", fields[5]); } - line = bufferedReader.readLine(); } + line = bufferedReader.readLine(); } - } else { - logger.warn("LRG mapping file " + lrgMappingFile + " not found"); } + logger.info(PARSING_DONE_LOG_MESSAGE, lrgMappingFile); } public String getLrg(String id, String field) throws RocksDBException { @@ -199,6 +187,8 @@ public String getLrg(String id, String field) throws RocksDBException { } protected void indexCancerGeneCensus(Path cgcFile) throws IOException, RocksDBException { + logger.info(PARSING_LOG_MESSAGE, cgcFile); + Map tissuesMap = new HashMap<>(); 
tissuesMap.put("E", "epithelial"); tissuesMap.put("L", "leukaemia/lymphoma"); @@ -224,10 +214,8 @@ protected void indexCancerGeneCensus(Path cgcFile) throws IOException, RocksDBEx mutationTypesMap.put("Mis", "missense"); mutationTypesMap.put("PromoterMis", "missense"); - logger.info("Indexing CANCER GENE CENSUS data ..."); - if (cgcFile != null && Files.exists(cgcFile) && Files.size(cgcFile) > 0) { + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(cgcFile)) { // Skip the first header line - BufferedReader bufferedReader = FileUtils.newBufferedReader(cgcFile); bufferedReader.readLine(); GeneCancerAssociation cancerGeneAssociation; @@ -237,9 +225,9 @@ protected void indexCancerGeneCensus(Path cgcFile) throws IOException, RocksDBEx // Find Ensembl Gene Id in the last comma-separated column List synonyms = StringUtils.isNotEmpty(fields[19]) ? Arrays.stream(fields[19] - .replaceAll("\"", "") - .replaceAll(" ", "") - .split(",")) + .replaceAll("\"", "") + .replaceAll(" ", "") + .split(",")) .collect(Collectors.toList()) : Collections.emptyList(); @@ -264,44 +252,44 @@ protected void indexCancerGeneCensus(Path cgcFile) throws IOException, RocksDBEx : Collections.emptyList(); List tissues = StringUtils.isNotEmpty(fields[12]) ? Arrays.stream(fields[12] - .replaceAll("\"", "") - .replaceAll(" ", "") - .split(",")) + .replaceAll("\"", "") + .replaceAll(" ", "") + .split(",")) .map(tissuesMap::get) .collect(Collectors.toList()) : Collections.emptyList(); List modeOfInheritance = StringUtils.isNotEmpty(fields[13]) ? fields[13].equalsIgnoreCase("Dom/Rec") - ? Arrays.asList(moiMap.get("Dom"), moiMap.get("Rec")) - : Collections.singletonList(moiMap.get(fields[13])) + ? Arrays.asList(moiMap.get("Dom"), moiMap.get("Rec")) + : Collections.singletonList(moiMap.get(fields[13])) : Collections.emptyList(); List roleInCancer = StringUtils.isNotEmpty(fields[14]) ? 
Arrays.stream(fields[14] - .replaceAll("\"", "") - .replaceAll(" ", "") - .split(",")) + .replaceAll("\"", "") + .replaceAll(" ", "") + .split(",")) .map(roleInCancerMap::get) .collect(Collectors.toList()) : Collections.emptyList(); List mutationTypes = StringUtils.isNotEmpty(fields[15]) ? Arrays.stream(fields[15] - .replaceAll("\"", "") - .replaceAll(" ", "") - .split(",")) + .replaceAll("\"", "") + .replaceAll(" ", "") + .split(",")) .map(mutationTypesMap::get) .collect(Collectors.toList()) : Collections.emptyList(); List translocationPartners = StringUtils.isNotEmpty(fields[16]) ? Arrays.stream(fields[16] - .replaceAll("\"", "") - .replaceAll(" ", "") - .split(",")) + .replaceAll("\"", "") + .replaceAll(" ", "") + .split(",")) .collect(Collectors.toList()) : Collections.emptyList(); List otherSyndromes = StringUtils.isNotEmpty(fields[18]) ? Arrays.stream(fields[18] - .replaceAll("\"", "") - .split("; ")) + .replaceAll("\"", "") + .split("; ")) .collect(Collectors.toList()) : Collections.emptyList(); @@ -312,10 +300,9 @@ protected void indexCancerGeneCensus(Path cgcFile) throws IOException, RocksDBEx rocksDbManager.update(rocksdb, fields[0] + CANCER_GENE_CENSUS_SUFFIX, cancerGeneAssociation); } } - bufferedReader.close(); - } else { - logger.warn("CANCER GENE CENSUS file " + cgcFile + " not found"); } + + logger.info(PARSING_DONE_LOG_MESSAGE, cgcFile); } public List getCancerGeneCensus(String geneName) throws RocksDBException, IOException { @@ -324,97 +311,102 @@ public List getCancerGeneCensus(String geneName) throws R } public void indexCancerHotspot(Path cancerHotspot) throws IOException, RocksDBException { + logger.info(PARSING_LOG_MESSAGE, cancerHotspot); + // Store all cancer hotspot (different gene and aminoacid position) for each gene in the same key Map> visited = new HashMap<>(); - FileInputStream fileInputStream = new FileInputStream(cancerHotspot.toFile()); - HSSFWorkbook workbook = new HSSFWorkbook(fileInputStream); - HSSFSheet sheet = 
workbook.getSheetAt(0); - Iterator iterator = sheet.iterator(); - iterator.next(); - while (iterator.hasNext()) { - Row currentRow = iterator.next(); - String geneName = currentRow.getCell(0).toString(); - - if (currentRow.getCell(1).toString().contains("splice")) { - continue; - } - int aminoAcidPosition = Integer.parseInt(currentRow.getCell(1).toString()); - - CancerHotspot ch = null; - // Check if ch object already exist - if (visited.containsKey(geneName)) { - for (CancerHotspot hotspot : visited.get(geneName)) { - if (hotspot.getAminoacidPosition() == aminoAcidPosition) { - ch = hotspot; - break; - } - } - } - // If not exist we create new ch - if (ch == null) { - ch = new CancerHotspot(); - ch.setScores(new HashMap<>()); - ch.setCancerTypeCount(new HashMap<>()); - ch.setOrganCount(new HashMap<>()); - ch.setVariants(new ArrayList<>()); - - // Parse new row - ch.setGeneName(geneName); - ch.setAminoacidPosition(aminoAcidPosition); - ch.getScores().put("log10Pvalue", Double.parseDouble(currentRow.getCell(2).toString())); - ch.setNumMutations(Integer.parseInt(currentRow.getCell(3).toString())); - - String[] cancerCountSplit = currentRow.getCell(11).toString().split("\\|"); - for (String cancerCount : cancerCountSplit) { - String[] split = cancerCount.split(":"); - ch.getCancerTypeCount().put(split[0], Integer.parseInt(split[2])); + try (FileInputStream fileInputStream = new FileInputStream(cancerHotspot.toFile())) { + HSSFWorkbook workbook = new HSSFWorkbook(fileInputStream); + HSSFSheet sheet = workbook.getSheetAt(0); + Iterator iterator = sheet.iterator(); + iterator.next(); + while (iterator.hasNext()) { + Row currentRow = iterator.next(); + String geneName = currentRow.getCell(0).toString(); + + if (currentRow.getCell(1).toString().contains("splice")) { + continue; } + int aminoAcidPosition = Integer.parseInt(currentRow.getCell(1).toString()); - String[] organCountSplit = currentRow.getCell(12).toString().split("\\|"); - for (String organCount : 
organCountSplit) { - String[] split = organCount.split(":"); - ch.getOrganCount().put(split[0], Integer.parseInt(split[2])); + CancerHotspot ch = null; + // Check if ch object already exist + if (visited.containsKey(geneName)) { + for (CancerHotspot hotspot : visited.get(geneName)) { + if (hotspot.getAminoacidPosition() == aminoAcidPosition) { + ch = hotspot; + break; + } + } } - ch.getScores().put("mutability", Double.parseDouble(currentRow.getCell(14).toString())); - ch.getScores().put("muProtein", Double.parseDouble(currentRow.getCell(15).toString())); - ch.setAnalysis(Arrays.asList(currentRow.getCell(17).toString().split(","))); - ch.getScores().put("qvalue", Double.parseDouble(currentRow.getCell(18).toString())); - ch.getScores().put("qvaluePancan", Double.parseDouble(currentRow.getCell(20).toString())); - ch.setAminoacidReference(currentRow.getCell(35).toString()); - ch.getScores().put("qvalueCancerType", Double.parseDouble(currentRow.getCell(36).toString())); - ch.setCancerType(currentRow.getCell(37).toString()); + // If not exist we create new ch + if (ch == null) { + ch = new CancerHotspot(); + ch.setScores(new HashMap<>()); + ch.setCancerTypeCount(new HashMap<>()); + ch.setOrganCount(new HashMap<>()); + ch.setVariants(new ArrayList<>()); + + // Parse new row + ch.setGeneName(geneName); + ch.setAminoacidPosition(aminoAcidPosition); + ch.getScores().put("log10Pvalue", Double.parseDouble(currentRow.getCell(2).toString())); + ch.setNumMutations(Integer.parseInt(currentRow.getCell(3).toString())); + + String[] cancerCountSplit = currentRow.getCell(11).toString().split("\\|"); + for (String cancerCount : cancerCountSplit) { + String[] split = cancerCount.split(":"); + ch.getCancerTypeCount().put(split[0], Integer.parseInt(split[2])); + } - if (visited.containsKey(geneName)) { - // Gene exists but no this aminoacid position - visited.get(geneName).add(ch); - } else { - // New gene found - visited.put(geneName, new ArrayList<>(Collections.singletonList(ch))); + 
String[] organCountSplit = currentRow.getCell(12).toString().split("\\|"); + for (String organCount : organCountSplit) { + String[] split = organCount.split(":"); + ch.getOrganCount().put(split[0], Integer.parseInt(split[2])); + } + + ch.getScores().put("mutability", Double.parseDouble(currentRow.getCell(14).toString())); + ch.getScores().put("muProtein", Double.parseDouble(currentRow.getCell(15).toString())); + ch.setAnalysis(Arrays.asList(currentRow.getCell(17).toString().split(","))); + ch.getScores().put("qvalue", Double.parseDouble(currentRow.getCell(18).toString())); + ch.getScores().put("qvaluePancan", Double.parseDouble(currentRow.getCell(20).toString())); + ch.setAminoacidReference(currentRow.getCell(35).toString()); + ch.getScores().put("qvalueCancerType", Double.parseDouble(currentRow.getCell(36).toString())); + ch.setCancerType(currentRow.getCell(37).toString()); + + if (visited.containsKey(geneName)) { + // Gene exists but no this aminoacid position + visited.get(geneName).add(ch); + } else { + // New gene found + visited.put(geneName, new ArrayList<>(Collections.singletonList(ch))); + } } - } - // Add cancer hotspot variant information - CancerHotspotVariant cancerHotspotVariant = new CancerHotspotVariant(); - cancerHotspotVariant.setSampleCount(new HashMap<>()); + // Add cancer hotspot variant information + CancerHotspotVariant cancerHotspotVariant = new CancerHotspotVariant(); + cancerHotspotVariant.setSampleCount(new HashMap<>()); - String[] alternateCountSplit = currentRow.getCell(8).toString().split(":"); - cancerHotspotVariant.setAminoacidAlternate(alternateCountSplit[0]); - cancerHotspotVariant.setCount(Integer.parseInt(alternateCountSplit[1])); + String[] alternateCountSplit = currentRow.getCell(8).toString().split(":"); + cancerHotspotVariant.setAminoacidAlternate(alternateCountSplit[0]); + cancerHotspotVariant.setCount(Integer.parseInt(alternateCountSplit[1])); - String[] sampleSplit = currentRow.getCell(38).toString().split("\\|"); - for 
(String sampleCount : sampleSplit) { - String[] sampleCountSplit = sampleCount.split(":"); - cancerHotspotVariant.getSampleCount().put(sampleCountSplit[0], Integer.parseInt(sampleCountSplit[1])); + String[] sampleSplit = currentRow.getCell(38).toString().split("\\|"); + for (String sampleCount : sampleSplit) { + String[] sampleCountSplit = sampleCount.split(":"); + cancerHotspotVariant.getSampleCount().put(sampleCountSplit[0], Integer.parseInt(sampleCountSplit[1])); + } + ch.getVariants().add(cancerHotspotVariant); } - ch.getVariants().add(cancerHotspotVariant); } - fileInputStream.close(); for (String geneName : visited.keySet()) { rocksDbManager.update(rocksdb, geneName + CANCER_HOTSPOT_SUFFIX, visited.get(geneName)); } + + logger.info(PARSING_DONE_LOG_MESSAGE, cancerHotspot); } public List getCancerHotspot(String geneName) throws RocksDBException, IOException { @@ -422,29 +414,25 @@ public List getCancerHotspot(String geneName) throws RocksDBExcep return rocksDbManager.getCancerHotspot(rocksdb, key); } - protected void indexTSO500(Path tso500Path) throws IOException, RocksDBException { - // Gene Ref Seq - // FAS NM_000043 - // AR NM_000044 - logger.info("Indexing TSO500 data ..."); - - if (tso500Path != null && Files.exists(tso500Path) && Files.size(tso500Path) > 0) { - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(tso500Path)) { - String line = bufferedReader.readLine(); - while (StringUtils.isNotEmpty(line)) { - if (!line.startsWith("#")) { - String[] fields = line.split("\t", -1); - if (fields.length == 2) { - rocksDbManager.update(rocksdb, fields[1] + TSO500_SUFFIX, "TSO500"); - } + logger.info(PARSING_LOG_MESSAGE, tso500Path); + + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(tso500Path)) { + String line = bufferedReader.readLine(); + // Gene Ref Seq + // FAS NM_000043 + // AR NM_000044 + while (StringUtils.isNotEmpty(line)) { + if (!line.startsWith("#")) { + String[] fields = line.split("\t", -1); + if (fields.length 
== 2) { + rocksDbManager.update(rocksdb, fields[1] + TSO500_SUFFIX, "TSO500"); } - line = bufferedReader.readLine(); } + line = bufferedReader.readLine(); } - } else { - logger.warn("Ensembl TSO500 mapping file " + tso500Path + " not found"); } + logger.info(PARSING_DONE_LOG_MESSAGE, tso500Path); } public String getTSO500(String transcriptId) throws RocksDBException { @@ -456,29 +444,25 @@ public String getTSO500(String transcriptId) throws RocksDBException { return new String(bytes); } - protected void indexEGLHHaemOnc(Path eglhHaemOncPath) throws IOException, RocksDBException { - // Gene Ref Seq - // GNB1 NM_002074.4 - // CSF3R NM_000760.3 - logger.info("Indexing EGLH HaemOnc data ..."); - - if (eglhHaemOncPath != null && Files.exists(eglhHaemOncPath) && Files.size(eglhHaemOncPath) > 0) { - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(eglhHaemOncPath)) { - String line = bufferedReader.readLine(); - while (StringUtils.isNotEmpty(line)) { - if (!line.startsWith("#")) { - String[] fields = line.split("\t", -1); - if (fields.length == 2) { - rocksDbManager.update(rocksdb, fields[1].split("\\.")[0] + EGLH_HAEMONC_SUFFIX, "EGLH_HaemOnc"); - } + logger.info(PARSING_LOG_MESSAGE, eglhHaemOncPath); + + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(eglhHaemOncPath)) { + String line = bufferedReader.readLine(); + // Gene Ref Seq + // GNB1 NM_002074.4 + // CSF3R NM_000760.3 + while (StringUtils.isNotEmpty(line)) { + if (!line.startsWith("#")) { + String[] fields = line.split("\t", -1); + if (fields.length == 2) { + rocksDbManager.update(rocksdb, fields[1].split("\\.")[0] + EGLH_HAEMONC_SUFFIX, "EGLH_HaemOnc"); } - line = bufferedReader.readLine(); } + line = bufferedReader.readLine(); } - } else { - logger.warn("Ensembl EGLH HaemOnc mapping file " + eglhHaemOncPath + " not found"); } + logger.info(PARSING_DONE_LOG_MESSAGE, eglhHaemOncPath); } public String getEGLHHaemOnc(String transcriptId) throws RocksDBException { @@ -510,4 +494,219 
@@ protected void close() throws IOException { rocksDbManager.closeIndex(rocksdb, dbOption, dbLocation); } + protected void indexDrugs(Path geneDrugFile) throws IOException, RocksDBException { + logger.info(PARSING_LOG_MESSAGE, geneDrugFile); + + String currentGene = ""; + List drugs = new ArrayList<>(); + + try (BufferedReader br = FileUtils.newBufferedReader(geneDrugFile)) { + // Skip header + br.readLine(); + + int lineCounter = 1; + String line; + while ((line = br.readLine()) != null) { + String[] parts = line.split("\t"); + String geneName = parts[0]; + if (currentGene.equals("")) { + currentGene = geneName; + } else if (!currentGene.equals(geneName)) { + rocksDbManager.update(rocksdb, currentGene + DRUGS_SUFFIX, drugs); + drugs = new ArrayList<>(); + currentGene = geneName; + } + + String source = null; + if (parts.length >= 4) { + source = parts[3]; + } + + String interactionType = null; + if (parts.length >= 5) { + interactionType = parts[4]; + } + + String drugName = null; + if (parts.length >= 8) { + // if drug name column is empty, use drug claim name instead + drugName = StringUtils.isEmpty(parts[7]) ? 
parts[6] : parts[7]; + } + if (StringUtils.isEmpty(drugName)) { + // no drug name + continue; + } + + String chemblId = null; + if (parts.length >= 9) { + chemblId = parts[8]; + } + + List publications = new ArrayList<>(); + if (parts.length >= 10 && parts[9] != null) { + publications = Arrays.asList(parts[9].split(",")); + } + + GeneDrugInteraction drug = new GeneDrugInteraction( + geneName, drugName, source, null, null, interactionType, chemblId, publications); + drugs.add(drug); + lineCounter++; + } + } + // update last gene + rocksDbManager.update(rocksdb, currentGene + DRUGS_SUFFIX, drugs); + + logger.info(PARSING_DONE_LOG_MESSAGE, geneDrugFile); + } + + protected void indexDiseases(Path hpoFilePath, Path disgenetFilePath) throws IOException, RocksDBException { + + Map> geneDiseaseAssociationMap = new HashMap<>(50000); + + String line; + + // HPO +// logger.info(PARSING_LOG_MESSAGE, hpoFilePath); +// try (BufferedReader bufferedReader = FileUtils.newBufferedReader(hpoFilePath)) { +// // Skip first header line +// bufferedReader.readLine(); +// while ((line = bufferedReader.readLine()) != null) { +// String[] fields = line.split("\t"); +// String omimId = fields[6]; +// String geneSymbol = fields[3]; +// String hpoId = fields[0]; +// String diseaseName = fields[1]; +// GeneTraitAssociation disease = +// new GeneTraitAssociation(omimId, diseaseName, hpoId, 0f, 0, new ArrayList<>(), new ArrayList<>(), HPO_DATA); +// addValueToMapElement(geneDiseaseAssociationMap, geneSymbol, disease); +// } +// } +// logger.info(PARSING_DONE_LOG_MESSAGE, hpoFilePath); + + // DisGeNet + logger.info(PARSING_LOG_MESSAGE, disgenetFilePath); + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(disgenetFilePath)) { + // Skip first header line + bufferedReader.readLine(); + while ((line = bufferedReader.readLine()) != null) { + String[] fields = line.split("\t"); + String diseaseId = fields[4]; + String diseaseName = fields[5]; + String score = fields[9]; + String 
numberOfPubmeds = fields[13].trim(); + String numberOfSNPs = fields[14]; + String source = fields[15]; + GeneTraitAssociation disease = new GeneTraitAssociation(diseaseId, diseaseName, "", Float.parseFloat(score), + Integer.parseInt(numberOfPubmeds), Arrays.asList(numberOfSNPs), Arrays.asList(source), DISGENET_DATA); + addValueToMapElement(geneDiseaseAssociationMap, fields[1], disease); + } + } + logger.info(PARSING_DONE_LOG_MESSAGE, disgenetFilePath); + + for (Map.Entry> entry : geneDiseaseAssociationMap.entrySet()) { + rocksDbManager.update(rocksdb, entry.getKey() + DISEASE_SUFFIX, entry.getValue()); + } + } + + protected void indexMiRTarBase(Path miRTarBaseFile) throws IOException, RocksDBException, CellBaseException { + logger.info(PARSING_LOG_MESSAGE, miRTarBaseFile); + + try (BufferedReader reader = Files.newBufferedReader(miRTarBaseFile)) { + String line; + // Skip header line + reader.readLine(); + + String currentMiRTarBaseId = null; + String currentMiRNA = null; + String currentGene = null; + List targetGenes = new ArrayList<>(); + Map> geneToMirna = new HashMap<>(); + + while ((line = reader.readLine()) != null) { + String[] field = line.split("\t", -1); + if (field.length != 9) { + throw new CellBaseException("Invalid number of columns " + field.length + " (expected 9 columns) parsing file " + + miRTarBaseFile + ". 
Line: " + line); + } + + // #0: miRTarBase ID + String miRTarBaseId = field[0]; + if (currentMiRTarBaseId == null) { + currentMiRTarBaseId = miRTarBaseId; + } + + // #1: miRNA + String miRNA = field[1]; + if (currentMiRNA == null) { + currentMiRNA = miRNA; + } + + // #2: Species (miRNA) + + // #3: Target Gene + String geneName = field[3]; + if (currentGene == null) { + currentGene = geneName; + } + + // #4: Target Gene (Entrez ID) + // #5: Species (Target Gene) + + if (!miRTarBaseId.equals(currentMiRTarBaseId) || !geneName.equals(currentGene)) { + // new entry, store current one + MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, "miRTarBase", currentMiRNA, targetGenes); + addValueToMapElement(geneToMirna, currentGene, miRnaTarget); + targetGenes = new ArrayList<>(); + currentGene = geneName; + currentMiRTarBaseId = miRTarBaseId; + currentMiRNA = miRNA; + } + + // #6: Experiments + String experiment = field[6]; + + // #7: Support Type + String supportType = field[7]; + + // #8: pubmed + String pubmed = field[8]; + + targetGenes.add(new TargetGene(experiment, supportType, pubmed)); + } + + // parse last entry + MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, MIRTARBASE_DATA, currentMiRNA, targetGenes); + addValueToMapElement(geneToMirna, currentGene, miRnaTarget); + + for (Map.Entry> entry : geneToMirna.entrySet()) { + rocksDbManager.update(rocksdb, entry.getKey() + MIRTARBASE_SUFFIX, entry.getValue()); + } + } + logger.info(PARSING_DONE_LOG_MESSAGE, miRTarBaseFile); + } + + protected static void addValueToMapElement(Map> map, String key, T value) { + if (map.containsKey(key)) { + map.get(key).add(value); + } else { + List valueList = new ArrayList<>(); + valueList.add(value); + map.put(key, valueList); + } + } + + protected List getDrugs(String id) throws RocksDBException, IOException { + String key = id + DRUGS_SUFFIX; + return rocksDbManager.getDrugs(rocksdb, key); + } + + protected List getDiseases(String id) throws RocksDBException, 
IOException { + String key = id + DISEASE_SUFFIX; + return rocksDbManager.getDiseases(rocksdb, key); + } + + protected List getMirnaTargets(String geneName) throws RocksDBException, IOException { + String key = geneName + MIRTARBASE_SUFFIX; + return rocksDbManager.getMirnaTargets(rocksdb, key); + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java index 48b0cd1d0d..56e1edd6ff 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java @@ -24,26 +24,43 @@ import org.opencb.cellbase.core.ParamConstants; import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.rocksdb.RocksDBException; +import java.io.File; import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.*; +import java.util.stream.Collectors; + +import static org.opencb.cellbase.lib.EtlCommons.*; public class RefSeqGeneBuilder extends CellBaseBuilder { + private Path downloadPath; + private Map transcriptDict; private Map exonDict; private Path gtfFile; private Path fastaFile; - private Path proteinFastaFile, cdnaFastaFile; - private Path maneFile, lrgFile, disgenetFile, hpoFile, geneDrugFile, miRTarBaseFile; - private Path cancerGeneCensus, cancerHotspot; - private Path tso500File, eglhHaemOncFile; + private Path proteinFastaFile; + private Path cdnaFastaFile; + private Path maneFile; + private Path lrgFile; + private Path disgenetFile; + private Path hpoFile; + private Path geneDrugFile; + private Path miRTarBaseFile; + private Path cancerGeneCensus; + private Path cancerHotspot; + private Path 
tso500File; + private Path eglhHaemOncFile; private SpeciesConfiguration speciesConfiguration; private static final Map REFSEQ_CHROMOSOMES = new HashMap<>(); - private final String status = "KNOWN"; + private static final String KNOWN_STATUS = "KNOWN"; private static final String SOURCE = ParamConstants.QueryParams.REFSEQ.key(); private Gene gene = null; private Transcript transcript = null; @@ -52,85 +69,95 @@ public class RefSeqGeneBuilder extends CellBaseBuilder { // sometimes there are two stop codons (eg NM_018159.4). Only parse the first one, skip the second private boolean seenStopCodon = false; - - public RefSeqGeneBuilder(Path refSeqDirectoryPath, SpeciesConfiguration speciesConfiguration, CellBaseSerializer serializer) { + public RefSeqGeneBuilder(Path downloadPath, SpeciesConfiguration speciesConfiguration, CellBaseSerializer serializer) { super(serializer); + this.downloadPath = downloadPath; this.speciesConfiguration = speciesConfiguration; - getGtfFileFromDirectoryPath(refSeqDirectoryPath); - getFastaFileFromDirectoryPath(refSeqDirectoryPath); - getProteinFastaFileFromDirectoryPath(refSeqDirectoryPath); - getCdnaFastaFileFromDirectoryPath(refSeqDirectoryPath); - setAnnotationFiles(refSeqDirectoryPath); - transcriptDict = new HashMap<>(250000); exonDict = new HashMap<>(8000000); } - private void setAnnotationFiles(Path refSeqDirectoryPath) { - Path geneDirectoryPath = refSeqDirectoryPath.getParent().resolve("gene"); - maneFile = geneDirectoryPath.resolve("MANE.GRCh38.v1.0.summary.txt.gz"); - lrgFile = geneDirectoryPath.resolve("list_LRGs_transcripts_xrefs.txt"); - geneDrugFile = geneDirectoryPath.resolve("dgidb.tsv"); - disgenetFile = geneDirectoryPath.resolve("all_gene_disease_associations.tsv.gz"); - hpoFile = geneDirectoryPath.resolve("phenotype_to_genes.txt"); - cancerGeneCensus = geneDirectoryPath.resolve("cancer-gene-census.tsv"); - cancerHotspot = geneDirectoryPath.resolve("hotspots_v2.xls"); - tso500File = 
geneDirectoryPath.resolve("TSO500_transcripts.txt"); - eglhHaemOncFile = geneDirectoryPath.resolve("EGLH_HaemOnc_transcripts.txt"); - miRTarBaseFile = refSeqDirectoryPath.getParent().resolve("regulation/hsa_MTI.xlsx"); - } - - private void getGtfFileFromDirectoryPath(Path refSeqDirectoryPath) { - for (String fileName : refSeqDirectoryPath.toFile().list()) { - if (fileName.endsWith(".gtf") || fileName.endsWith(".gtf.gz")) { - gtfFile = refSeqDirectoryPath.resolve(fileName); - break; - } + public void check() throws Exception { + if (checked) { + return; } - } - private void getFastaFileFromDirectoryPath(Path refSeqDirectoryPath) { - for (String fileName : refSeqDirectoryPath.toFile().list()) { - if (fileName.endsWith("genomic.fna") || fileName.endsWith("genomic.fna.gz")) { - fastaFile = refSeqDirectoryPath.resolve(fileName); - break; - } - } - } + String refSeqGeneLabel = getDataName(REFSEQ_DATA) + " " + getDataName(GENE_DATA); + logger.info(CHECKING_BEFORE_BUILDING_LOG_MESSAGE, refSeqGeneLabel); - private void getProteinFastaFileFromDirectoryPath(Path refSeqDirectoryPath) { - for (String fileName : refSeqDirectoryPath.toFile().list()) { - if (fileName.endsWith(".faa") || fileName.endsWith(".faa.gz")) { - proteinFastaFile = refSeqDirectoryPath.resolve(fileName); - break; + // Sanity check + checkDirectory(downloadPath, refSeqGeneLabel); + if (!Files.exists(serializer.getOutdir())) { + try { + Files.createDirectories(serializer.getOutdir()); + } catch (IOException e) { + throw new CellBaseException("Error creating folder " + serializer.getOutdir(), e); } } - } - private void getCdnaFastaFileFromDirectoryPath(Path refSeqDirectoryPath) { - for (String fileName : refSeqDirectoryPath.toFile().list()) { - if (fileName.endsWith("cdna.fna") || fileName.endsWith("cdna.fna.gz")) { - cdnaFastaFile = refSeqDirectoryPath.resolve(fileName); - break; - } - } + // Check RefSeq files + List files = checkFiles(refSeqGeneLabel, REFSEQ_DATA, downloadPath, 4); + gtfFile = 
files.stream().filter(f -> f.getName().contains(".gtf")).findFirst().get().toPath(); + proteinFastaFile = files.stream().filter(f -> f.getName().contains("_protein")).findFirst().get().toPath(); + cdnaFastaFile = files.stream().filter(f -> f.getName().contains("_rna")).findFirst().get().toPath(); + fastaFile = files.stream().filter(f -> f.getName().contains("_genomic.fna")).findFirst().get().toPath(); + + // Check common files + maneFile = checkFiles(MANE_SELECT_DATA, downloadPath.getParent(), 1).get(0).toPath(); + lrgFile = checkFiles(LRG_DATA, downloadPath.getParent(), 1).get(0).toPath(); + cancerHotspot = checkFiles(CANCER_HOTSPOT_DATA, downloadPath.getParent(), 1).get(0).toPath(); + geneDrugFile = checkFiles(DGIDB_DATA, downloadPath.getParent(), 1).get(0).toPath(); + // hpoFile = checkFiles(HPO_DATA, downloadPath.getParent(), 1); + disgenetFile = checkFiles(DISGENET_DATA, downloadPath.getParent(), 1).get(0).toPath(); + // cancerGeneCensus = ; + // tso500File = ; + // eglhHaemOncFile = ; + + // Check regulation files + // mirtarbase + // The downloaded .xlsx file contains errors and it has to be fixed manually + logger.info("Checking {} folder and files", getDataName(MIRTARBASE_DATA)); + Path downloadRegulationPath = downloadPath.getParent().getParent().resolve(REGULATION_DATA); + List mirTarBaseFiles = ((DataSource) dataSourceReader.readValue(downloadRegulationPath.resolve( + getDataVersionFilename(MIRTARBASE_DATA)).toFile())).getUrls().stream().map(u -> Paths.get(u).getFileName().toString()) + .collect(Collectors.toList()); + if (mirTarBaseFiles.size() != 1) { + throw new CellBaseException("One " + getDataName(MIRTARBASE_DATA) + " file is expected at " + downloadRegulationPath + + ", but currently there are " + mirTarBaseFiles.size() + " files"); + } + // The hsa_MIT.xlsx is fixed and converted to hsa_MIT.csv manually + if (!mirTarBaseFiles.get(0).endsWith(XLSX_EXTENSION)) { + throw new CellBaseException("A " + XLSX_EXTENSION + " " + 
getDataName(MIRTARBASE_DATA) + " file is expected at " + + downloadRegulationPath + ", but currently it is named " + mirTarBaseFiles.get(0)); + } + miRTarBaseFile = downloadRegulationPath.resolve(mirTarBaseFiles.get(0).replace(XLSX_EXTENSION, CSV_EXTENSION)); + if (!Files.exists(miRTarBaseFile)) { + throw new CellBaseException("The " + getDataName(MIRTARBASE_DATA) + " fixed file " + miRTarBaseFile + " does not exist"); + } + + logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE); + checked = true; } public void parse() throws Exception { + check(); + // Preparing the fasta file for fast accessing FastaIndex fastaIndex = null; if (fastaFile != null) { fastaIndex = new FastaIndex(fastaFile); } - // index protein sequences for later + // Index protein sequences for later + logger.info("Indexing gene annotation for {} ...", getDataName(REFSEQ_DATA)); RefSeqGeneBuilderIndexer indexer = new RefSeqGeneBuilderIndexer(gtfFile.getParent()); indexer.index(maneFile, lrgFile, proteinFastaFile, cdnaFastaFile, geneDrugFile, hpoFile, disgenetFile, miRTarBaseFile, cancerGeneCensus, cancerHotspot, tso500File, eglhHaemOncFile); + logger.info("Indexing done for {}", getDataName(REFSEQ_DATA)); - logger.info("Parsing RefSeq gtf..."); + logger.info(PARSING_LOG_MESSAGE, gtfFile); GtfReader gtfReader = new GtfReader(gtfFile); Gtf gtf; @@ -164,22 +191,24 @@ public void parse() throws Exception { } } - // add xrefs to last transcript + // Add xrefs to last transcript addXrefs(transcript, geneDbxrefs, exonDbxrefs); - // last gene must be serialized + // Last gene must be serialized store(); - // cleaning + // Close gtfReader.close(); serializer.close(); if (fastaIndex != null) { fastaIndex.close(); } indexer.close(); + + logger.info(PARSING_DONE_LOG_MESSAGE, gtfFile); } - // store right before parsing the previous gene, or the very last gene. + // Store right before parsing the previous gene, or the very last gene. 
private void store() { serializer.serialize(gene); reset(); @@ -235,7 +264,7 @@ private void parseGene(Gtf gtf, String chromosome, RefSeqGeneBuilderIndexer inde null, indexer.getMirnaTargets(geneName), indexer.getCancerGeneCensus(geneName), indexer.getCancerHotspot(geneName)); gene = new Gene(geneId, geneName, chromosome, gtf.getStart(), gtf.getEnd(), gtf.getStrand(), "1", geneBiotype, - status, SOURCE, geneDescription, new ArrayList<>(), null, geneAnnotation); + KNOWN_STATUS, SOURCE, geneDescription, new ArrayList<>(), null, geneAnnotation); geneDbxrefs = parseXrefs(gtf); } @@ -567,7 +596,7 @@ private Transcript getTranscript(Gtf gtf, String chromosome, String transcriptId if ("mRNA".equals(biotype)) { biotype = "protein_coding"; } - transcript = new Transcript(transcriptId, name, chromosome, gtf.getStart(), gtf.getEnd(), gtf.getStrand(), biotype, status, + transcript = new Transcript(transcriptId, name, chromosome, gtf.getStart(), gtf.getEnd(), gtf.getStrand(), biotype, KNOWN_STATUS, 0, 0, 0, 0, 0, indexer.getCdnaFasta(transcriptId), "", "", "", version, SOURCE, new ArrayList<>(), new ArrayList<>(), new ArrayList<>(), new HashSet<>(), new TranscriptAnnotation()); @@ -644,6 +673,20 @@ private String getSequenceName(String fullSequenceName) { return fullSequenceName; } +// private void setAnnotationFiles(Path refSeqDirectoryPath) { +// Path geneDirectoryPath = refSeqDirectoryPath.getParent().resolve("gene"); +// maneFile = geneDirectoryPath.resolve("MANE.GRCh38.v1.0.summary.txt.gz"); +// lrgFile = geneDirectoryPath.resolve("list_LRGs_transcripts_xrefs.txt"); +// geneDrugFile = geneDirectoryPath.resolve("dgidb.tsv"); +// disgenetFile = geneDirectoryPath.resolve("all_gene_disease_associations.tsv.gz"); +// hpoFile = geneDirectoryPath.resolve("phenotype_to_genes.txt"); +// cancerGeneCensus = geneDirectoryPath.resolve("cancer-gene-census.tsv"); +// cancerHotspot = geneDirectoryPath.resolve("hotspots_v2.xls"); +// tso500File = 
geneDirectoryPath.resolve("TSO500_transcripts.txt"); +// eglhHaemOncFile = geneDirectoryPath.resolve("EGLH_HaemOnc_transcripts.txt"); +// miRTarBaseFile = refSeqDirectoryPath.getParent().resolve("regulation/hsa_MTI.xlsx"); +// } + static { REFSEQ_CHROMOSOMES.put("NC_000001", "1"); REFSEQ_CHROMOSOMES.put("NC_000002", "2"); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java index 45520161f5..9aae170ce2 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java @@ -16,25 +16,16 @@ package org.opencb.cellbase.lib.builders; -import org.apache.commons.lang.StringUtils; -import org.apache.poi.ss.usermodel.*; -import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.opencb.biodata.formats.io.FileFormatException; -import org.opencb.biodata.models.core.MirnaTarget; -import org.opencb.biodata.models.core.TargetGene; -import org.opencb.biodata.models.variant.avro.GeneDrugInteraction; -import org.opencb.biodata.models.variant.avro.GeneTraitAssociation; -import org.opencb.commons.utils.FileUtils; +import org.opencb.cellbase.core.exception.CellBaseException; import org.rocksdb.RocksDBException; -import java.io.BufferedReader; -import java.io.FileInputStream; import java.io.IOException; -import java.nio.file.Files; import java.nio.file.Path; -import java.util.*; -public class RefSeqGeneBuilderIndexer extends GeneBuilderIndexer{ +import static org.opencb.cellbase.lib.EtlCommons.REFSEQ_DATA; + +public class RefSeqGeneBuilderIndexer extends GeneBuilderIndexer { public RefSeqGeneBuilderIndexer(Path refSeqDirectoryPath) { super(refSeqDirectoryPath); @@ -42,249 +33,17 @@ public RefSeqGeneBuilderIndexer(Path refSeqDirectoryPath) { public void index(Path maneFile, Path lrgFile, Path proteinFastaFile, Path 
cDnaFastaFile, Path geneDrugFile, Path hpoFilePath, Path disgenetFile, Path miRTarBaseFile, Path cancerGeneGensus, Path cancerHotspot, Path tso500File, - Path eglhHaemOncFile) throws IOException, RocksDBException, FileFormatException { - indexManeMapping(maneFile, "refseq"); - indexLrgMapping(lrgFile, "refseq"); + Path eglhHaemOncFile) throws IOException, RocksDBException, FileFormatException, CellBaseException { + indexManeMapping(maneFile, REFSEQ_DATA); + indexLrgMapping(lrgFile, REFSEQ_DATA); indexProteinSequences(proteinFastaFile); indexCdnaSequences(cDnaFastaFile); indexDrugs(geneDrugFile); indexDiseases(hpoFilePath, disgenetFile); indexMiRTarBase(miRTarBaseFile); - indexCancerGeneCensus(cancerGeneGensus); +// indexCancerGeneCensus(cancerGeneGensus); indexCancerHotspot(cancerHotspot); - indexTSO500(tso500File); - indexEGLHHaemOnc(eglhHaemOncFile); - } - - private void indexDrugs(Path geneDrugFile) throws IOException, RocksDBException { - if (geneDrugFile != null && Files.exists(geneDrugFile) && Files.size(geneDrugFile) > 0) { - logger.info("Loading gene-drug interaction data from '{}'", geneDrugFile); - BufferedReader br = FileUtils.newBufferedReader(geneDrugFile); - - // Skip header - br.readLine(); - - int lineCounter = 1; - String line; - String currentGene = ""; - List drugs = new ArrayList<>(); - while ((line = br.readLine()) != null) { - String[] parts = line.split("\t"); - String geneName = parts[0]; - if (currentGene.equals("")) { - currentGene = geneName; - } else if (!currentGene.equals(geneName)) { - rocksDbManager.update(rocksdb, currentGene + DRUGS_SUFFIX, drugs); - drugs = new ArrayList<>(); - currentGene = geneName; - } - - String source = null; - if (parts.length >= 4) { - source = parts[3]; - } - - String interactionType = null; - if (parts.length >= 5) { - interactionType = parts[4]; - } - - String drugName = null; - if (parts.length >= 8) { - // if drug name column is empty, use drug claim name instead - drugName = 
StringUtils.isEmpty(parts[7]) ? parts[6] : parts[7]; - } - if (StringUtils.isEmpty(drugName)) { - // no drug name - continue; - } - - String chemblId = null; - if (parts.length >= 9) { - chemblId = parts[8]; - } - - List publications = new ArrayList<>(); - if (parts.length >= 10 && parts[9] != null) { - publications = Arrays.asList(parts[9].split(",")); - } - - GeneDrugInteraction drug = new GeneDrugInteraction( - geneName, drugName, source, null, null, interactionType, chemblId, publications); - drugs.add(drug); - lineCounter++; - } - br.close(); - // update last gene - rocksDbManager.update(rocksdb, currentGene + DRUGS_SUFFIX, drugs); - } else { - logger.warn("Gene drug file " + geneDrugFile + " not found"); - logger.warn("Ignoring " + geneDrugFile); - } - } - - public List getDrugs(String id) throws RocksDBException, IOException { - String key = id + DRUGS_SUFFIX; - return rocksDbManager.getDrugs(rocksdb, key); - } - - private void indexDiseases(Path hpoFilePath, Path disgenetFilePath) throws IOException, RocksDBException { - Map> geneDiseaseAssociationMap = new HashMap<>(50000); - - String line; - if (hpoFilePath != null && hpoFilePath.toFile().exists() && Files.size(hpoFilePath) > 0) { - BufferedReader bufferedReader = FileUtils.newBufferedReader(hpoFilePath); - // skip first header line - bufferedReader.readLine(); - while ((line = bufferedReader.readLine()) != null) { - String[] fields = line.split("\t"); - String omimId = fields[6]; - String geneSymbol = fields[3]; - String hpoId = fields[0]; - String diseaseName = fields[1]; - GeneTraitAssociation disease = - new GeneTraitAssociation(omimId, diseaseName, hpoId, 0f, 0, new ArrayList<>(), new ArrayList<>(), "hpo"); - addValueToMapElement(geneDiseaseAssociationMap, geneSymbol, disease); - } - bufferedReader.close(); - } - - if (disgenetFilePath != null && disgenetFilePath.toFile().exists() && Files.size(disgenetFilePath) > 0) { - BufferedReader bufferedReader = FileUtils.newBufferedReader(disgenetFilePath); - 
// skip first header line - bufferedReader.readLine(); - while ((line = bufferedReader.readLine()) != null) { - String[] fields = line.split("\t"); - String diseaseId = fields[4]; - String diseaseName = fields[5]; - String score = fields[9]; - String numberOfPubmeds = fields[13].trim(); - String numberOfSNPs = fields[14]; - String source = fields[15]; - GeneTraitAssociation disease = new GeneTraitAssociation(diseaseId, diseaseName, "", Float.parseFloat(score), - Integer.parseInt(numberOfPubmeds), Arrays.asList(numberOfSNPs), Arrays.asList(source), "disgenet"); - addValueToMapElement(geneDiseaseAssociationMap, fields[1], disease); - } - bufferedReader.close(); - } - - for (Map.Entry> entry : geneDiseaseAssociationMap.entrySet()) { - rocksDbManager.update(rocksdb, entry.getKey() + DISEASE_SUFFIX, entry.getValue()); - } - } - - public List getDiseases(String id) throws RocksDBException, IOException { - String key = id + DISEASE_SUFFIX; - return rocksDbManager.getDiseases(rocksdb, key); - } - - private void indexMiRTarBase(Path miRTarBaseFile) throws IOException, RocksDBException { - if (miRTarBaseFile != null && Files.exists(miRTarBaseFile) && Files.size(miRTarBaseFile) > 0) { - logger.info("Loading mirna targets from '{}'", miRTarBaseFile); - FileInputStream file = new FileInputStream(miRTarBaseFile.toFile()); - Workbook workbook = new XSSFWorkbook(file); - Sheet sheet = workbook.getSheetAt(0); - Iterator iterator = sheet.iterator(); - String currentMiRTarBaseId = null; - String currentMiRNA = null; - String currentGene = null; - List targetGenes = new ArrayList(); - Map> geneToMirna = new HashMap(); - while (iterator.hasNext()) { - - Row currentRow = iterator.next(); - Iterator cellIterator = currentRow.iterator(); - - Cell cell = cellIterator.next(); - String miRTarBaseId = cell.getStringCellValue(); - - // skip header - if (miRTarBaseId.startsWith("miRTarBase")) { - continue; - } - - if (currentMiRTarBaseId == null) { - currentMiRTarBaseId = miRTarBaseId; - } - - 
cell = cellIterator.next(); - String miRNA = cell.getStringCellValue(); - if (currentMiRNA == null) { - currentMiRNA = miRNA; - } - - // species - cellIterator.next(); - - cell = cellIterator.next(); - String geneName = cell.getStringCellValue(); - if (currentGene == null) { - currentGene = geneName; - } - - // entrez - cellIterator.next(); - // species - cellIterator.next(); - - if (!miRTarBaseId.equals(currentMiRTarBaseId) || !geneName.equals(currentGene)) { - // new entry, store current one - MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, "miRTarBase", currentMiRNA, - targetGenes); - addValueToMapElement(geneToMirna, currentGene, miRnaTarget); - targetGenes = new ArrayList(); - currentGene = geneName; - currentMiRTarBaseId = miRTarBaseId; - currentMiRNA = miRNA; - } - - // experiment - cell = cellIterator.next(); - String experiment = cell.getStringCellValue(); - - // support type - cell = cellIterator.next(); - String supportType = cell.getStringCellValue(); - - // pubmeds - cell = cellIterator.next(); - String pubmed = null; - // seems to vary, so check both - if (cell.getCellType().equals(CellType.NUMERIC)) { - pubmed = String.valueOf(cell.getNumericCellValue()); - } else { - pubmed = cell.getStringCellValue(); - } - - targetGenes.add(new TargetGene(experiment, supportType, pubmed)); - } - - // parse last entry - MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, "miRTarBase", currentMiRNA, - targetGenes); - addValueToMapElement(geneToMirna, currentGene, miRnaTarget); - - for (Map.Entry> entry : geneToMirna.entrySet()) { - rocksDbManager.update(rocksdb, entry.getKey() + MIRTARBASE_SUFFIX, entry.getValue()); - } - } else { - logger.error("mirtarbase file not found"); - } +// indexTSO500(tso500File); +// indexEGLHHaemOnc(eglhHaemOncFile); } - - public List getMirnaTargets(String geneName) throws RocksDBException, IOException { - String key = geneName + MIRTARBASE_SUFFIX; - return rocksDbManager.getMirnaTargets(rocksdb, key); - } - - 
private static void addValueToMapElement(Map> map, String key, T value) { - if (map.containsKey(key)) { - map.get(key).add(value); - } else { - List valueList = new ArrayList<>(); - valueList.add(value); - map.put(key, valueList); - } - } - } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RocksDbManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RocksDbManager.java index cf8351cc54..3a178b4828 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RocksDbManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RocksDbManager.java @@ -60,8 +60,11 @@ public RocksDB getDBConnection(String dbLocation) { Options options = new Options().setCreateIfMissing(true); RocksDB db = null; try { + if (!Files.exists(Paths.get(dbLocation))) { + Files.createDirectories(Paths.get(dbLocation)); + } return RocksDB.open(options, dbLocation); - } catch (RocksDBException e) { + } catch (RocksDBException | IOException e) { // do some error handling e.printStackTrace(); System.exit(1); diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderTest.java new file mode 100644 index 0000000000..63d1f445a8 --- /dev/null +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderTest.java @@ -0,0 +1,22 @@ +package org.opencb.cellbase.lib.builders; + +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.config.SpeciesConfiguration; + +import java.nio.file.Path; +import java.nio.file.Paths; + +class EnsemblGeneBuilderTest { + + public void testGeneBuilder() throws Exception { + Path downloadPath = Paths.get("/home/jtarraga/data/cellbase/cb6/v6.1.0-dr1/homo_sapiens_grch38/download/gene"); + Path buildPath = Paths.get("/home/jtarraga/data/cellbase/cb6/v6.1.0-dr1/homo_sapiens_grch38/generated_json/gene"); + boolean 
flexibleGTFParsing = false; + CellBaseConfiguration configuration = CellBaseConfiguration.load(Paths.get("/home/jtarraga/appl/cellbase/build/conf/configuration.yml")); + SpeciesConfiguration speciesConfiguration = configuration.getSpeciesConfig("hsapiens"); + + GeneBuilder geneBuilder = new GeneBuilder(downloadPath, buildPath, speciesConfiguration, flexibleGTFParsing); + geneBuilder.check(); + geneBuilder.parse(); + } +} \ No newline at end of file diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/GeneBuilderTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/GeneBuilderTest.java index 5926c0184b..798c1a29db 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/GeneBuilderTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/GeneBuilderTest.java @@ -55,23 +55,23 @@ public GeneBuilderTest() { @BeforeAll public void init() { - try { - Path genomeSequenceFastaFile - = Paths.get(GeneBuilderTest.class.getResource("/gene/Homo_sapiens.GRCh38.fa").toURI()); - Path geneDirectoryPath = Paths.get(GeneBuilderTest.class.getResource("/gene").toURI()); - // put the results in /tmp - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), "gene", - true); - SpeciesConfiguration species = new SpeciesConfiguration("hsapiens", "Homo sapiens", - "human", null, null, null); - geneParser = new GeneBuilder(geneDirectoryPath, genomeSequenceFastaFile, species, serializer); - jsonObjectMapper = new ObjectMapper(); - jsonObjectMapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); - jsonObjectMapper.setSerializationInclusion(JsonInclude.Include.NON_NULL); - geneParser.parse(); - } catch (Exception e) { - e.printStackTrace(); - } +// try { +// Path genomeSequenceFastaFile +// = Paths.get(GeneBuilderTest.class.getResource("/gene/Homo_sapiens.GRCh38.fa").toURI()); +// Path geneDirectoryPath = Paths.get(GeneBuilderTest.class.getResource("/gene").toURI()); +// // put the 
results in /tmp +// CellBaseSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), "gene", +// true); +// SpeciesConfiguration species = new SpeciesConfiguration("hsapiens", "Homo sapiens", +// "human", null, null, null); +// geneParser = new GeneBuilder(geneDirectoryPath, genomeSequenceFastaFile, species, serializer); +// jsonObjectMapper = new ObjectMapper(); +// jsonObjectMapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); +// jsonObjectMapper.setSerializationInclusion(JsonInclude.Include.NON_NULL); +// geneParser.parse(); +// } catch (Exception e) { +// e.printStackTrace(); +// } } @Test @@ -226,36 +226,36 @@ public void testProteinSequence() throws Exception { } } - @Test - @Disabled - public void testaddTranscriptTfbstoList() throws Exception { - String attributes = "binding_matrix_stable_id=ENSPFM0542;epigenomes_with_experimental_evidence=SK-N.%2CMCF-7%2CH1-hESC_3%2CHCT116;stable_id=ENSM00208374688;transcription_factor_complex=TEAD4::ESRRB"; - String source = null; - String sequenceName = "1"; - String feature = "TF_binding_site"; - int start = 10000; - int end = 100100; - String score = "1.2870005"; - String strand = "+"; - String frame = null; - - Gff2 tfbs = new Gff2(sequenceName, source, feature, start, end, score, strand, frame, attributes); - Gtf transcript = new Gtf(sequenceName, source, feature, start, end, score, strand, frame, new HashMap<>()); - - List transcriptTfbs = geneParser.addTranscriptTfbstoList(tfbs, transcript,"1", new ArrayList<>()); - - assertEquals(1, transcriptTfbs.size()); - TranscriptTfbs result = transcriptTfbs.get(0); - - assertEquals(sequenceName, result.getChromosome()); - assertEquals(feature, result.getType()); - assertEquals(start, result.getStart()); - assertEquals(end, result.getEnd()); - assertEquals(score, String.valueOf(result.getScore())); - assertEquals("ENSPFM0542", result.getPfmId()); - assertEquals("ENSM00208374688", result.getId()); - assertEquals(2, 
result.getTranscriptionFactors().size()); - } +// @Test +// @Disabled +// public void testaddTranscriptTfbstoList() throws Exception { +// String attributes = "binding_matrix_stable_id=ENSPFM0542;epigenomes_with_experimental_evidence=SK-N.%2CMCF-7%2CH1-hESC_3%2CHCT116;stable_id=ENSM00208374688;transcription_factor_complex=TEAD4::ESRRB"; +// String source = null; +// String sequenceName = "1"; +// String feature = "TF_binding_site"; +// int start = 10000; +// int end = 100100; +// String score = "1.2870005"; +// String strand = "+"; +// String frame = null; +// +// Gff2 tfbs = new Gff2(sequenceName, source, feature, start, end, score, strand, frame, attributes); +// Gtf transcript = new Gtf(sequenceName, source, feature, start, end, score, strand, frame, new HashMap<>()); +// +// List transcriptTfbs = geneParser.addTranscriptTfbstoList(tfbs, transcript,"1", new ArrayList<>()); +// +// assertEquals(1, transcriptTfbs.size()); +// TranscriptTfbs result = transcriptTfbs.get(0); +// +// assertEquals(sequenceName, result.getChromosome()); +// assertEquals(feature, result.getType()); +// assertEquals(start, result.getStart()); +// assertEquals(end, result.getEnd()); +// assertEquals(score, String.valueOf(result.getScore())); +// assertEquals("ENSPFM0542", result.getPfmId()); +// assertEquals("ENSM00208374688", result.getId()); +// assertEquals(2, result.getTranscriptionFactors().size()); +// } private List loadSerializedGenes(String fileName) { List geneList = new ArrayList(); From e7c238511fa41ed4763126dfd6512f72cb1f73f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 10 May 2024 11:07:35 +0200 Subject: [PATCH 069/148] lib: update clinical variant downloader by moving the split ClinVar file to the build step, adding log messages, fixing sonnar issues,... 
#TASK-5575, #TASK-5564 --- .../executors/DownloadCommandExecutor.java | 6 +- .../core/config/DownloadProperties.java | 39 ----- .../org/opencb/cellbase/lib/EtlCommons.java | 40 +++-- .../lib/download/AbstractDownloadManager.java | 16 +- .../lib/download/ClinicalDownloadManager.java | 156 ++++++------------ 5 files changed, 87 insertions(+), 170 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java index 8da49800df..5a0fb00877 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java @@ -41,7 +41,7 @@ public class DownloadCommandExecutor extends CommandExecutor { private Path outputDirectory; private static final List VALID_SOURCES_TO_DOWNLOAD = Arrays.asList(GENOME_DATA, GENE_DATA, VARIATION_FUNCTIONAL_SCORE_DATA, - MISSENSE_VARIATION_SCORE_DATA, REGULATION_DATA, PROTEIN_DATA, CONSERVATION_DATA, CLINICAL_VARIANTS_DATA, REPEATS_DATA, + MISSENSE_VARIATION_SCORE_DATA, REGULATION_DATA, PROTEIN_DATA, CONSERVATION_DATA, CLINICAL_VARIANT_DATA, REPEATS_DATA, ONTOLOGY_DATA, PUBMED_DATA, PHARMACOGENOMICS_DATA); public DownloadCommandExecutor(AdminCliOptionsParser.DownloadCommandOptions downloadCommandOptions) { @@ -86,7 +86,7 @@ public void execute() throws CellBaseException { case CONSERVATION_DATA: downloadFiles.addAll(downloader.downloadConservation()); break; - case CLINICAL_VARIANTS_DATA: + case CLINICAL_VARIANT_DATA: downloadFiles.addAll(downloader.downloadClinicalVariants()); break; case REPEATS_DATA: @@ -132,7 +132,7 @@ private List checkDataSources() { case REGULATION_DATA: case PROTEIN_DATA: case CONSERVATION_DATA: - case CLINICAL_VARIANTS_DATA: + case CLINICAL_VARIANT_DATA: case REPEATS_DATA: case ONTOLOGY_DATA: case PUBMED_DATA: diff 
--git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java index bb44f91138..a52e7ce544 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java @@ -44,10 +44,6 @@ public class DownloadProperties { private URLProperties phylop; private URLProperties gerp; private URLProperties clinvar; - private URLProperties clinvarVariation; - private URLProperties clinvarSummary; - private URLProperties clinvarVariationAllele; - private URLProperties clinvarEfoTerms; private URLProperties cosmic; private URLProperties hgmd; private URLProperties dgv; @@ -225,41 +221,6 @@ public DownloadProperties setClinvar(URLProperties clinvar) { return this; } - public URLProperties getClinvarVariation() { - return clinvarVariation; - } - - public DownloadProperties setClinvarVariation(URLProperties clinvarVariation) { - this.clinvarVariation = clinvarVariation; - return this; - } - - public URLProperties getClinvarSummary() { - return clinvarSummary; - } - - public DownloadProperties setClinvarSummary(URLProperties clinvarSummary) { - this.clinvarSummary = clinvarSummary; - return this; - } - - public URLProperties getClinvarVariationAllele() { - return clinvarVariationAllele; - } - - public void setClinvarVariationAllele(URLProperties clinvarVariationAllele) { - this.clinvarVariationAllele = clinvarVariationAllele; - } - - public URLProperties getClinvarEfoTerms() { - return clinvarEfoTerms; - } - - public DownloadProperties setClinvarEfoTerms(URLProperties clinvarEfoTerms) { - this.clinvarEfoTerms = clinvarEfoTerms; - return this; - } - public URLProperties getCosmic() { return cosmic; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 
e0a19c7114..57a592bb54 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -47,6 +47,10 @@ public final class EtlCommons { // Commons public static final String XLSX_EXTENSION = ".xlsx"; public static final String CSV_EXTENSION = ".csv"; + public static final String TBI_EXTENSION = ".tbi"; + public static final String FAI_EXTENSION = ".fai"; + + public static final String OK_LOG_MESSAGE = "Ok."; // Ensembl public static final String ENSEMBL_DATA = "ensembl"; @@ -139,7 +143,6 @@ public final class EtlCommons { public static final String GO_ANNOTATION_FILE_ID = "GO_ANNOTATION"; public static final String VARIATION_DATA = "variation"; - public static final String CLINICAL_VARIANTS_DATA = "clinical_variants"; public static final String SPLICE_SCORE_DATA = "splice_score"; // Pharmacogenomics @@ -165,10 +168,10 @@ public final class EtlCommons { public static final String REVEL_FILE_ID = "REVEL"; // Clinical variants data - public static final String CLINICAL_VARIANTS_SUBDIRECTORY = "clinicalVariant"; + public static final String CLINICAL_VARIANT_DATA = "clinical_variant"; + public static final String CLINICAL_VARIANTS_BASENAME = "clinicalVariant"; // ClinVar - public static final String CLINVAR_NAME = "ClinVar"; - public static final String CLINVAR_VERSION_FILENAME = "clinvar" + SUFFIX_VERSION_FILENAME; + public static final String CLINVAR_DATA = "clinvar"; public static final String CLINVAR_CHUNKS_SUBDIRECTORY = "clinvar_chunks"; // Must match the configuration file public static final String CLINVAR_FULL_RELEASE_FILE_ID = "FULL_RELEASE"; @@ -176,20 +179,18 @@ public final class EtlCommons { public static final String CLINVAR_ALLELE_FILE_ID = "ALLELE"; public static final String CLINVAR_EFO_TERMS_FILE_ID = "EFO_TERMS"; // COSMIC - public static final String COSMIC_NAME = "COSMIC"; - public static final String COSMIC_VERSION_FILENAME = "cosmic" + 
SUFFIX_VERSION_FILENAME; + public static final String COSMIC_DATA = "cosmic"; // Must match the configuration file public static final String COSMIC_FILE_ID = "COSMIC"; // HGMD - public static final String HGMD_NAME = "HGMD"; - public static final String HGMD_VERSION_FILENAME = "hgmd" + SUFFIX_VERSION_FILENAME; + public static final String HGMD_DATA = "hgmd"; // Must match the configuration file public static final String HGMD_FILE_ID = "HGMD"; // GWAS - public static final String GWAS_NAME = "GWAS catalog"; - public static final String GWAS_VERSION_FILENAME = "gwas" + SUFFIX_VERSION_FILENAME; + public static final String GWAS_DATA = "gwas"; // Must match the configuration file public static final String GWAS_FILE_ID = "GWAS"; + public static final String GWAS_DBSNP_FILE_ID = "DBSNP"; // Repeats public static final String REPEATS_DATA = "repeats"; @@ -345,7 +346,7 @@ public final class EtlCommons { dataNamesMap.put(GENE_DISEASE_ANNOTATION_DATA, "Gene Disease Annotation"); dataNamesMap.put(HPO_DATA, "HPO"); dataNamesMap.put(DISGENET_DATA, "DisGeNet"); - dataNamesMap.put(GNOMAD_CONSTRAINTS_DATA, "gnomAD Constraints"); + dataNamesMap.put(GNOMAD_CONSTRAINTS_DATA, "gnomAD Constraint"); dataNamesMap.put(GO_ANNOTATION_DATA, "EBI Gene Ontology Annotation"); dataNamesMap.put(PROTEIN_DATA, "Protein"); dataNamesMap.put(UNIPROT_DATA, "UniProt"); @@ -372,10 +373,15 @@ public final class EtlCommons { dataNamesMap.put(PUBMED_DATA, "PubMed"); dataNamesMap.put(PHARMACOGENOMICS_DATA, "Pharmacogenomics"); dataNamesMap.put(PHARMGKB_DATA, "PharmGKB"); - dataNamesMap.put(VARIATION_FUNCTIONAL_SCORE_DATA, "Variant Functional Scores"); + dataNamesMap.put(VARIATION_FUNCTIONAL_SCORE_DATA, "Variant Functional Score"); dataNamesMap.put(CADD_DATA, "CADD"); - dataNamesMap.put(MISSENSE_VARIATION_SCORE_DATA, "Missense Variation Scores"); + dataNamesMap.put(MISSENSE_VARIATION_SCORE_DATA, "Missense Variation Score"); dataNamesMap.put(REVEL_DATA, "Revel"); + dataNamesMap.put(CLINICAL_VARIANT_DATA, 
"Clinical Variant"); + dataNamesMap.put(CLINVAR_DATA, "ClinVar"); + dataNamesMap.put(COSMIC_DATA, "Cosmic"); + dataNamesMap.put(HGMD_DATA, "HGMD"); + dataNamesMap.put(GWAS_DATA, "GWAS Catalog"); // Populate data categories map dataCategoriesMap.put(ENSEMBL_DATA, "Gene"); @@ -413,6 +419,10 @@ public final class EtlCommons { dataCategoriesMap.put(PHARMGKB_DATA, dataNamesMap.get(PHARMACOGENOMICS_DATA)); dataCategoriesMap.put(CADD_DATA, dataNamesMap.get(VARIATION_FUNCTIONAL_SCORE_DATA)); dataCategoriesMap.put(REVEL_DATA, dataNamesMap.get(MISSENSE_VARIATION_SCORE_DATA)); + dataCategoriesMap.put(CLINVAR_DATA, dataNamesMap.get(CLINICAL_VARIANT_DATA)); + dataCategoriesMap.put(COSMIC_DATA, dataNamesMap.get(CLINICAL_VARIANT_DATA)); + dataCategoriesMap.put(HGMD_DATA, dataNamesMap.get(CLINICAL_VARIANT_DATA)); + dataCategoriesMap.put(GWAS_DATA, dataNamesMap.get(CLINICAL_VARIANT_DATA)); // Populate data version filenames Map dataVersionFilenamesMap.put(ENSEMBL_DATA, "ensemblCore" + SUFFIX_VERSION_FILENAME); @@ -450,6 +460,10 @@ public final class EtlCommons { dataVersionFilenamesMap.put(PHARMGKB_DATA, "pharmGKB" + SUFFIX_VERSION_FILENAME); dataVersionFilenamesMap.put(CADD_DATA, "cadd" + SUFFIX_VERSION_FILENAME); dataVersionFilenamesMap.put(REVEL_DATA, "revel" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(CLINVAR_DATA, "clinVar" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(COSMIC_DATA, "cosmic" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(HGMD_DATA, "hgmd" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(GWAS_DATA, "gwas" + SUFFIX_VERSION_FILENAME); } private EtlCommons() { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index 7c4e331f18..7ac8bcf800 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -52,9 +52,9 @@ public abstract class AbstractDownloadManager { protected static final String DOWNLOADING_LOG_MESSAGE = "Downloading {} ..."; - protected static final String DOWNLOADING_DONE_LOG_MESSAGE = "Downloading {} done!"; + protected static final String DOWNLOADING_DONE_LOG_MESSAGE = "Ok. {}"; protected static final String CATEGORY_DOWNLOADING_LOG_MESSAGE = "Downloading {}/{} ..."; - protected static final String CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE = "Downloading {}/{} done!"; + protected static final String CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE = "Ok. {}/{}"; protected static final String DOWNLOADING_FROM_TO_LOG_MESSAGE = "Downloading {} to {} ..."; protected String species; @@ -195,8 +195,8 @@ protected DownloadFile downloadAndSaveEnsemblDataSource(DownloadProperties.Ensem DownloadFile downloadFile = downloadEnsemblDataSource(ensemblProps, fileId, chromosome, outPath); // Save data source - saveDataSource(data, "(Ensembl " + ensemblVersion + ")", getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - outPath.resolve(getDataVersionFilename(data))); + saveDataSource(data, "(" + getDataName(ENSEMBL_DATA) + " " + ensemblVersion + ")", getTimeStamp(), + Collections.singletonList(downloadFile.getUrl()), outPath.resolve(getDataVersionFilename(data))); return downloadFile; } @@ -226,7 +226,9 @@ protected DownloadFile downloadDataSource(DownloadProperties.URLProperties props String url = EtlCommons.getUrl(props, fileId, species, assembly, chromosome); File outFile = outPath.resolve(getFilenameFromUrl(url)).toFile(); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outFile); - return downloadFile(url, outFile.toString()); + DownloadFile downloadFile = downloadFile(url, outFile.toString()); + logger.info(OK_LOG_MESSAGE); + return downloadFile; } protected DownloadFile downloadEnsemblDataSource(DownloadProperties.EnsemblProperties ensemblProps, String fileId, 
Path outPath) @@ -240,7 +242,9 @@ protected DownloadFile downloadEnsemblDataSource(DownloadProperties.EnsemblPrope chromosome); File outFile = outPath.resolve(getFilenameFromUrl(url)).toFile(); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outFile); - return downloadFile(url, outFile.toString()); + DownloadFile downloadFile = downloadFile(url, outFile.toString()); + logger.info(OK_LOG_MESSAGE); + return downloadFile; } protected void saveDataSource(String data, String version, String date, List urls, Path versionFilePath) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java index 77f658626a..9fd0e7562c 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java @@ -20,12 +20,8 @@ import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.lib.EtlCommons; -import org.opencb.commons.utils.FileUtils; -import java.io.BufferedReader; -import java.io.FileOutputStream; import java.io.IOException; -import java.io.PrintWriter; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; @@ -50,114 +46,56 @@ public List download() throws IOException, InterruptedException, C } public List downloadClinical() throws IOException, InterruptedException, CellBaseException { - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - Path clinicalFolder = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANTS_SUBDIRECTORY).toAbsolutePath(); - Files.createDirectories(clinicalFolder); - logger.info("Downloading clinical information at {} ...", clinicalFolder); - - String url; - List urls; - Path outPath; - DownloadProperties.URLProperties props; - - DownloadFile downloadFile; - List 
downloadFiles = new ArrayList<>(); - - // COSMIC - logger.warn("{} files must be downloaded manually !", COSMIC_NAME); - props = configuration.getDownload().getCosmic(); - urls = Collections.singletonList(props.getHost() + props.getFiles().get(COSMIC_FILE_ID)); - // Save data source - saveDataSource(EtlCommons.CLINICAL_VARIANTS_DATA, COSMIC_NAME, props.getVersion(), getTimeStamp(), urls, - clinicalFolder.resolve(COSMIC_VERSION_FILENAME)); - - // HGMD - logger.warn("{} files must be downloaded manually !", HGMD_NAME); - props = configuration.getDownload().getHgmd(); - urls = Collections.singletonList(props.getHost() + props.getFiles().get(HGMD_FILE_ID)); - // Save data source - saveDataSource(EtlCommons.CLINICAL_VARIANTS_DATA, HGMD_NAME, props.getVersion(), getTimeStamp(), urls, - clinicalFolder.resolve(HGMD_VERSION_FILENAME)); - - // GWAS catalog - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGwasCatalog(), GWAS_FILE_ID, GWAS_NAME, - CLINICAL_VARIANTS_DATA, GWAS_VERSION_FILENAME, clinicalFolder); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CLINICAL_VARIANT_DATA)); + if (!speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + logger.info("{} not supported for the species {}", getDataName(CLINICAL_VARIANT_DATA), + speciesConfiguration.getScientificName()); + return Collections.emptyList(); + } + + // Create clinical directory + Path clinicalPath = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANT_DATA).toAbsolutePath(); + Files.createDirectories(clinicalPath); + + DownloadFile downloadFile; + List downloadFiles = new ArrayList<>(); + + // ClinVar + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CLINVAR_DATA)); + DownloadProperties.URLProperties props = configuration.getDownload().getClinvar(); + List urls = new ArrayList<>(); + for (String fileId : Arrays.asList(CLINVAR_FULL_RELEASE_FILE_ID, CLINVAR_SUMMARY_FILE_ID, CLINVAR_ALLELE_FILE_ID, + CLINVAR_EFO_TERMS_FILE_ID)) { + downloadFile = downloadDataSource(props, 
fileId, clinicalPath); downloadFiles.add(downloadFile); - // ClinVar - logger.info("Downloading {}} files ...", CLINVAR_NAME); - props = configuration.getDownload().getClinvar(); - urls = new ArrayList<>(); - for (String fileId : Arrays.asList(CLINVAR_FULL_RELEASE_FILE_ID, CLINVAR_SUMMARY_FILE_ID, CLINVAR_ALLELE_FILE_ID, - CLINVAR_EFO_TERMS_FILE_ID)) { - url = props.getHost() + props.getFiles().get(fileId); - outPath = clinicalFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outPath); - downloadFiles.add(downloadFile(url, outPath.toString())); - urls.add(url); - } - // Save data source - saveDataSource(EtlCommons.CLINICAL_VARIANTS_DATA, CLINVAR_NAME, props.getVersion(), getTimeStamp(), urls, - clinicalFolder.resolve(CLINVAR_VERSION_FILENAME)); - - // Prepare CliVar chunk files - Path chunksPath = clinicalFolder.resolve(CLINVAR_CHUNKS_SUBDIRECTORY); - if (Files.notExists(chunksPath)) { - Files.createDirectories(chunksPath); - Path clinvarPath = clinicalFolder.resolve(getFilenameFromUrl( - props.getHost() + props.getFiles().get(CLINVAR_FULL_RELEASE_FILE_ID))); - logger.info("Splitting {} in {} ...", clinvarPath, chunksPath); - splitClinvar(clinvarPath, chunksPath); - } - - return downloadFiles; + // Save URLs to be written in the version file + urls.add(downloadFile.getUrl()); } - return Collections.emptyList(); - } + // Save data source + saveDataSource(CLINVAR_DATA, props.getVersion(), getTimeStamp(), urls, clinicalPath.resolve(getDataVersionFilename(CLINVAR_DATA))); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(CLINVAR_DATA)); + + // COSMIC + logger.warn("{} files must be downloaded manually !", getDataName(COSMIC_DATA)); + props = configuration.getDownload().getCosmic(); + String url = props.getHost() + props.getFiles().get(COSMIC_FILE_ID); + saveDataSource(COSMIC_DATA, props.getVersion(), getTimeStamp(), Collections.singletonList(url), + clinicalPath.resolve(getDataVersionFilename(COSMIC_DATA))); + + // HGMD 
+ logger.warn("{} files must be downloaded manually !", getDataName(HGMD_DATA)); + props = configuration.getDownload().getHgmd(); + url = props.getHost() + props.getFiles().get(HGMD_FILE_ID); + saveDataSource(HGMD_DATA, props.getVersion(), getTimeStamp(), Collections.singletonList(url), + clinicalPath.resolve(getDataVersionFilename(HGMD_DATA))); + + // GWAS catalog + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GWAS_DATA)); + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGwasCatalog(), GWAS_FILE_ID, GWAS_DATA, clinicalPath); + downloadFiles.add(downloadFile); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GWAS_DATA)); - private void splitClinvar(Path clinvarXmlFilePath, Path splitOutdirPath) throws IOException { - PrintWriter pw = null; - try (BufferedReader br = FileUtils.newBufferedReader(clinvarXmlFilePath)) { - StringBuilder header = new StringBuilder(); - boolean beforeEntry = true; - boolean inEntry = false; - int count = 0; - int chunk = 0; - String line; - while ((line = br.readLine()) != null) { - if (line.trim().startsWith("")) { - inEntry = false; - if (count % 10000 == 0) { - if (pw != null) { - pw.print(""); - pw.close(); - } - chunk++; - } - } - } - if (pw != null) { - pw.print(""); - pw.close(); - } - } + return downloadFiles; } } From f5b7c34d17815b50e4112fde967b9a055a459d49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 10 May 2024 11:09:44 +0200 Subject: [PATCH 070/148] lib: update clinical variant builder by including the split ClinVar file to the build step, adding checks and log messages, fixing sonnar issues,... 
#TASK-5576, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 37 ++-- .../lib/builders/CellBaseBuilder.java | 52 ++++- .../clinical/variant/ClinVarIndexer.java | 4 +- .../clinical/variant/ClinicalIndexer.java | 2 +- .../variant/ClinicalVariantBuilder.java | 206 +++++++++++++----- .../clinical/variant/CosmicIndexer.java | 2 +- .../clinical/variant/HGMDIndexer.java | 2 +- .../variant/ClinicalVariantBuilderTest.java | 6 +- 8 files changed, 222 insertions(+), 89 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 081880ebe3..899e5f52d3 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -39,7 +39,6 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardCopyOption; -import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; @@ -65,7 +64,7 @@ public class BuildCommandExecutor extends CommandExecutor { private SpeciesConfiguration speciesConfiguration; private static final List VALID_SOURCES_TO_BUILD = Arrays.asList(GENOME_DATA, GENE_DATA, VARIATION_FUNCTIONAL_SCORE_DATA, - MISSENSE_VARIATION_SCORE_DATA, REGULATION_DATA, PROTEIN_DATA, CONSERVATION_DATA, CLINICAL_VARIANTS_DATA, REPEATS_DATA, + MISSENSE_VARIATION_SCORE_DATA, REGULATION_DATA, PROTEIN_DATA, CONSERVATION_DATA, CLINICAL_VARIANT_DATA, REPEATS_DATA, ONTOLOGY_DATA, SPLICE_SCORE_DATA, PUBMED_DATA, PHARMACOGENOMICS_DATA); public BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildCommandOptions) { @@ -150,7 +149,7 @@ public void execute() throws CellBaseException { case CONSERVATION_DATA: parser = buildConservation(); break; - case CLINICAL_VARIANTS_DATA: + case 
CLINICAL_VARIANT_DATA: parser = buildClinicalVariants(); break; case REPEATS_DATA: @@ -306,30 +305,24 @@ private CellBaseBuilder buildConservation() throws CellBaseException { } private CellBaseBuilder buildClinicalVariants() throws CellBaseException { - Path clinicalVariantFolder = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANTS_SUBDIRECTORY); - - List versionFiles = new ArrayList<>(); - List versionFilenames = Arrays.asList(CLINVAR_VERSION_FILENAME, COSMIC_VERSION_FILENAME, GWAS_VERSION_FILENAME, - HGMD_VERSION_FILENAME); - for (String versionFilename : versionFilenames) { - Path versionFile = clinicalVariantFolder.resolve(versionFilename); - if (!versionFile.toFile().exists()) { - throw new CellBaseException("Could not build clinical variants because of the file " + versionFilename + " does not exist"); - } - versionFiles.add(versionFile); - } - copyVersionFiles(versionFiles); - - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, - EtlCommons.CLINICAL_VARIANTS_JSON_FILE.replace(".json.gz", ""), true); - return new ClinicalVariantBuilder(clinicalVariantFolder, normalize, getFastaReferenceGenome(), + // Sanity check + Path clinicalDownloadPath = downloadFolder.resolve(CLINICAL_VARIANT_DATA); + Path clinicalBuildPath = buildFolder.resolve(CLINICAL_VARIANT_DATA); + copyVersionFiles(Arrays.asList(clinicalDownloadPath.resolve(getDataVersionFilename(CLINVAR_DATA)), + clinicalDownloadPath.resolve(getDataVersionFilename(COSMIC_DATA)), + clinicalDownloadPath.resolve(getDataVersionFilename(HGMD_DATA)), + clinicalDownloadPath.resolve(getDataVersionFilename(GWAS_DATA))), clinicalBuildPath); + + // Create the file serializer and the clinical variants builder + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(clinicalBuildPath, CLINICAL_VARIANTS_BASENAME, true); + return new ClinicalVariantBuilder(clinicalDownloadPath, normalize, getFastaReferenceGenome(), buildCommandOptions.assembly == null ? 
getDefaultHumanAssembly() : buildCommandOptions.assembly, configuration, serializer); } private String getDefaultHumanAssembly() { for (SpeciesConfiguration species : configuration.getSpecies().getVertebrates()) { - if (species.getId().equals("hsapiens")) { + if (species.getId().equals(HSAPIENS_NAME)) { return species.getAssemblies().get(0).getName(); } } @@ -461,7 +454,7 @@ private List checkDataSources() { case REGULATION_DATA: case PROTEIN_DATA: case CONSERVATION_DATA: - case CLINICAL_VARIANTS_DATA: + case CLINICAL_VARIANT_DATA: case REPEATS_DATA: case ONTOLOGY_DATA: case SPLICE_SCORE_DATA: diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java index 26fb2e838b..fe1b5fe648 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java @@ -19,17 +19,21 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectReader; import org.apache.commons.lang3.StringUtils; +import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseSerializer; +import org.opencb.cellbase.lib.EtlCommons; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; @@ -48,7 +52,7 @@ public abstract class CellBaseBuilder { protected Logger logger; public static final String CHECKING_BEFORE_BUILDING_LOG_MESSAGE = "Checking files before building {} ..."; - public static final String CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE 
= "Checking done!"; + public static final String CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE = "Checking {} done!"; public static final String BUILDING_LOG_MESSAGE = "Building {} ..."; public static final String BUILDING_DONE_LOG_MESSAGE = "Building done!"; @@ -59,7 +63,6 @@ public abstract class CellBaseBuilder { public static final String PARSING_LOG_MESSAGE = "Parsing {} ..."; public static final String PARSING_DONE_LOG_MESSAGE = "Parsing done!"; - public CellBaseBuilder(CellBaseSerializer serializer) { logger = LoggerFactory.getLogger(this.getClass()); @@ -79,6 +82,24 @@ public void disconnect() { } } + protected File checkFile(String data, DownloadProperties.URLProperties props, String fileId, Path targetPath) throws CellBaseException { + logger.info("Checking file {}/{} ...", getDataName(data), fileId); + if (!props.getFiles().containsKey(fileId)) { + throw new CellBaseException("File ID " + fileId + " does not exist in the configuration file in the section '" + data + "'"); + } + if (!Files.exists(targetPath)) { + throw new CellBaseException("Folder does not exist " + targetPath); + } + + String filename = Paths.get(props.getFiles().get(fileId)).getFileName().toString(); + Path filePath = targetPath.resolve(filename); + if (!Files.exists(filePath)) { + throw new CellBaseException(getDataName(data) + " file " + filePath + " does not exist"); + } + logger.info("Ok."); + return filePath.toFile(); + } + protected List checkFiles(String data, Path downloadPath, int expectedFiles) throws CellBaseException, IOException { return checkFiles(getDataName(data), data, downloadPath, expectedFiles); } @@ -94,7 +115,7 @@ protected List checkFiles(String label, String data, Path downloadPath, in } protected List checkFiles(DataSource dataSource, Path targetPath, String name) throws CellBaseException { - logger.info("Checking {} folder and files", name); + logger.info("Checking {} folder and files ...", name); if (!targetPath.toFile().exists()) { throw new 
CellBaseException(name + " folder does not exist " + targetPath); } @@ -110,7 +131,30 @@ protected List checkFiles(DataSource dataSource, Path targetPath, String n files.add(file); } } - + logger.info("Ok."); return files; } + + protected Path getIndexFastaReferenceGenome(Path fastaPath) throws CellBaseException { + Path indexFastaPath = Paths.get(fastaPath + FAI_EXTENSION); + if (!Files.exists(indexFastaPath)) { + // Index FASTA file + logger.info("Indexing FASTA file {} ...", fastaPath); + String errorMsg = "Error executing 'samtools faidx' for FASTA file "; + try { + List params = Arrays.asList("faidx", fastaPath.toString()); + EtlCommons.runCommandLineProcess(null, "samtools", params, null); + } catch (IOException e) { + throw new CellBaseException(errorMsg + fastaPath, e); + } catch (InterruptedException e) { + // Restore interrupted state... + Thread.currentThread().interrupt(); + throw new CellBaseException(errorMsg + fastaPath, e); + } + if (!Files.exists(indexFastaPath)) { + throw new CellBaseException("It could not index the FASTA file " + fastaPath + ". 
Please, try to do it manually!"); + } + } + return indexFastaPath; + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java index 7e5baa9e6d..951ea5c530 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java @@ -309,7 +309,7 @@ private void addNewEntries(VariantAnnotation variantAnnotation, String variation String mateVariantString, String clinicalHaplotypeString, Map traitsToEfoTermsMap) { - EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_NAME, version, null); + EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, version, null); // Create a set to avoid situations like germline;germline;germline List alleleOrigin = null; if (!EtlCommons.isMissing(lineFields[VARIANT_SUMMARY_ORIGIN_COLUMN])) { @@ -390,7 +390,7 @@ private void addNewEntries(VariantAnnotation variantAnnotation, PublicSetType pu throws JsonProcessingException { List additionalProperties = new ArrayList<>(3); - EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_NAME, version, null); + EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, version, null); // String accession = publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc(); VariantClassification variantClassification = getVariantClassification( diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java index bbe33017fd..3f6e87b89c 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java @@ -83,7 +83,7 @@ public ClinicalIndexer(Path genomeSequenceFilePath) throws IOException { .setDecomposeMNVs(true); if (genomeSequenceFilePath != null) { - logger.info("Enabling left aligning by using sequence at {}", genomeSequenceFilePath.toString()); + logger.info("Enabling left aligning by using sequence at {}", genomeSequenceFilePath); variantNormalizerConfig.enableLeftAlign(genomeSequenceFilePath.toString()); } else { logger.info("Left alignment is NOT enabled."); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java index 41b701fdbe..e3c7ab3ff8 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java @@ -23,100 +23,155 @@ import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.opencb.cellbase.lib.builders.CellBaseBuilder; +import org.opencb.commons.utils.FileUtils; import org.rocksdb.Options; import org.rocksdb.RocksDB; import org.rocksdb.RocksDBException; import org.rocksdb.RocksIterator; -import java.io.File; -import java.io.IOException; +import java.io.*; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import static org.opencb.cellbase.lib.EtlCommons.*; + /** * Created by fjlopez on 26/09/16. 
*/ public class ClinicalVariantBuilder extends CellBaseBuilder { - private final Path clinicalVariantFolder; + private final Path clinicalVariantPath; private final String assembly; private final Path genomeSequenceFilePath; private boolean normalize; + private Path clinvarFullReleaseFilePath; + private Path clinvarSummaryFilePath; + private Path clinvarVariationAlleleFilePath; + private Path clinvarEFOFilePath; + private Path cosmicFilePath; + private Path hgmdFilePath; + private Path gwasFilePath; + private Path gwasDbSnpFilePath; + private final CellBaseConfiguration configuration; public ClinicalVariantBuilder(Path clinicalVariantFolder, boolean normalize, Path genomeSequenceFilePath, String assembly, CellBaseConfiguration configuration, CellBaseSerializer serializer) { super(serializer); - this.clinicalVariantFolder = clinicalVariantFolder; + this.clinicalVariantPath = clinicalVariantFolder; this.normalize = normalize; this.genomeSequenceFilePath = genomeSequenceFilePath; this.assembly = assembly; this.configuration = configuration; } + public void check() throws CellBaseException, IOException { + if (checked) { + return; + } + + logger.info(CHECKING_BEFORE_BUILDING_LOG_MESSAGE, getDataName(CLINICAL_VARIANT_DATA)); + + // Sanity check + checkDirectory(clinicalVariantPath, getDataName(CLINICAL_VARIANT_DATA)); + if (!Files.exists(serializer.getOutdir())) { + try { + Files.createDirectories(serializer.getOutdir()); + } catch (IOException e) { + throw new CellBaseException("Error creating folder " + serializer.getOutdir(), e); + } + } + + // Check genome file + logger.info("Checking genome FASTA file ..."); + if (!Files.exists(genomeSequenceFilePath)) { + throw new CellBaseException("Genome file path does not exist " + genomeSequenceFilePath); + } + logger.info(OK_LOG_MESSAGE); + logger.info("Checking index for genome FASTA file ..."); + getIndexFastaReferenceGenome(genomeSequenceFilePath); + logger.info(OK_LOG_MESSAGE); + + // Check ClinVar files + 
clinvarFullReleaseFilePath = checkFile(CLINVAR_DATA, configuration.getDownload().getClinvar(), CLINVAR_FULL_RELEASE_FILE_ID, + clinicalVariantPath).toPath(); + clinvarSummaryFilePath = checkFile(CLINVAR_DATA, configuration.getDownload().getClinvar(), CLINVAR_SUMMARY_FILE_ID, + clinicalVariantPath).toPath(); + clinvarVariationAlleleFilePath = checkFile(CLINVAR_DATA, configuration.getDownload().getClinvar(), CLINVAR_ALLELE_FILE_ID, + clinicalVariantPath).toPath(); + clinvarEFOFilePath = checkFile(CLINVAR_DATA, configuration.getDownload().getClinvar(), CLINVAR_EFO_TERMS_FILE_ID, + clinicalVariantPath).toPath(); + + // Check COSMIC file + cosmicFilePath = checkFiles(COSMIC_DATA, clinicalVariantPath, 1).get(0).toPath(); + + // Check HGMD file + hgmdFilePath = checkFiles(HGMD_DATA, clinicalVariantPath, 1).get(0).toPath(); + + // Check GWAS files + gwasFilePath = checkFiles(GWAS_DATA, clinicalVariantPath, 1).get(0).toPath(); + String dbSnpFilename = Paths.get(configuration.getDownload().getGwasCatalog().getFiles().get(GWAS_DBSNP_FILE_ID)).getFileName() + .toString(); + gwasDbSnpFilePath = clinicalVariantPath.resolve(dbSnpFilename); + if (!Files.exists(gwasDbSnpFilePath)) { + throw new CellBaseException("Could not build clinical variants: the dbSNP file " + dbSnpFilename + " is missing at " + + clinicalVariantPath); + } + if (!Files.exists(clinicalVariantPath.resolve(dbSnpFilename + TBI_EXTENSION))) { + throw new CellBaseException("Could not build clinical variants: the dbSNP tabix file " + dbSnpFilename + TBI_EXTENSION + + " is missing at " + clinicalVariantPath); + } + + logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE, getDataName(CLINICAL_VARIANT_DATA)); + checked = true; + } + public void parse() throws IOException, RocksDBException, CellBaseException { + check(); + + // Prepare ClinVar chunk files before building (if necessary) + Path chunksPath = serializer.getOutdir().resolve(CLINVAR_CHUNKS_SUBDIRECTORY); + if (Files.notExists(chunksPath)) { + 
Files.createDirectories(chunksPath); + logger.info("Splitting CliVar file {} in {} ...", clinvarFullReleaseFilePath, chunksPath); + splitClinvar(clinvarFullReleaseFilePath, chunksPath); + logger.info(OK_LOG_MESSAGE); + } + RocksDB rdb = null; Options dbOption = null; String dbLocation = null; try { - Object[] dbConnection = getDBConnection(clinicalVariantFolder.toString() + "/integration.idx", true); + Object[] dbConnection = getDBConnection(clinicalVariantPath.toString() + "/integration.idx", true); rdb = (RocksDB) dbConnection[0]; dbOption = (Options) dbConnection[1]; dbLocation = (String) dbConnection[2]; // COSMIC - // IMPORTANT: COSMIC must be indexed first (before ClinVar, IARC TP53, DOCM, HGMD,...)!!! - Path cosmicFile = clinicalVariantFolder.resolve(configuration.getDownload().getCosmic().getFiles().get(0)); - if (cosmicFile != null && Files.exists(cosmicFile)) { - CosmicIndexer cosmicIndexer = new CosmicIndexer(cosmicFile, configuration.getDownload().getCosmic().getVersion(), - normalize, genomeSequenceFilePath, assembly, rdb); - cosmicIndexer.index(); - } else { - throw new CellBaseException("Could not build clinical variants: the COSMIC file " + cosmicFile + " is missing"); - } + // IMPORTANT: COSMIC must be indexed first (before ClinVar, HGMD,...)!!! 
+ CosmicIndexer cosmicIndexer = new CosmicIndexer(cosmicFilePath, configuration.getDownload().getCosmic().getVersion(), + normalize, genomeSequenceFilePath, assembly, rdb); + cosmicIndexer.index(); // ClinVar - Path clinvarXMLFile = getPathFromHost(configuration.getDownload().getClinvar().getHost()); - Path clinvarSummaryFile = getPathFromHost(configuration.getDownload().getClinvarSummary().getHost()); - Path clinvarVariationAlleleFile = getPathFromHost(configuration.getDownload().getClinvarVariationAllele().getHost()); - Path clinvarEFOFile = getPathFromHost(configuration.getDownload().getClinvarEfoTerms().getHost()); - ClinVarIndexer clinvarIndexer = new ClinVarIndexer(clinvarXMLFile.getParent().resolve("clinvar_chunks"), clinvarSummaryFile, - clinvarVariationAlleleFile, clinvarEFOFile, configuration.getDownload().getClinvar().getVersion(), normalize, - genomeSequenceFilePath, assembly, rdb); + ClinVarIndexer clinvarIndexer = new ClinVarIndexer(serializer.getOutdir().resolve(CLINVAR_CHUNKS_SUBDIRECTORY), + clinvarSummaryFilePath, clinvarVariationAlleleFilePath, clinvarEFOFilePath, configuration.getDownload().getClinvar() + .getVersion(), normalize, genomeSequenceFilePath, assembly, rdb); clinvarIndexer.index(); // HGMD - Path hgmdFile = clinicalVariantFolder.resolve(configuration.getDownload().getHgmd().getFiles().get(0)); - if (hgmdFile != null && Files.exists(hgmdFile)) { - HGMDIndexer hgmdIndexer = new HGMDIndexer(hgmdFile, configuration.getDownload().getHgmd().getVersion(), normalize, - genomeSequenceFilePath, assembly, rdb); - hgmdIndexer.index(); - } else { - throw new CellBaseException("Could not build clinical variants: the HGMD file " + hgmdFile + " is missing"); - } + HGMDIndexer hgmdIndexer = new HGMDIndexer(hgmdFilePath, configuration.getDownload().getHgmd().getVersion(), normalize, + genomeSequenceFilePath, assembly, rdb); + hgmdIndexer.index(); // GWAS catalog - Path gwasFile = 
clinicalVariantFolder.resolve(Paths.get(configuration.getDownload().getGwasCatalog().getHost()).getFileName()); - if (gwasFile != null && Files.exists(gwasFile)) { - Path dbsnpFile = clinicalVariantFolder.resolve(configuration.getDownload().getGwasCatalog().getFiles().get(0)); - if (dbsnpFile != null && Files.exists(dbsnpFile)) { - Path tabixFile = Paths.get(dbsnpFile.toAbsolutePath() + ".tbi"); - if (tabixFile != null && Files.exists(tabixFile)) { - GwasIndexer gwasIndexer = new GwasIndexer(gwasFile, dbsnpFile, genomeSequenceFilePath, assembly, rdb); - gwasIndexer.index(); - } else { - throw new CellBaseException("Could not build clinical variants: the dbSNP tabix file " + tabixFile + " is missing"); - } - } else { - throw new CellBaseException("Could not build clinical variants: the dbSNP file " + dbsnpFile + " is missing"); - } - } else { - throw new CellBaseException("Could not build clinical variants: the GWAS catalog file " + gwasFile + " is missing"); - } + GwasIndexer gwasIndexer = new GwasIndexer(gwasFilePath, gwasDbSnpFilePath, genomeSequenceFilePath, assembly, rdb); + gwasIndexer.index(); + // Serialize serializeRDB(rdb); closeIndex(rdb, dbOption, dbLocation); serializer.close(); @@ -127,14 +182,6 @@ public void parse() throws IOException, RocksDBException, CellBaseException { } } - private Path getPathFromHost(String host) throws CellBaseException { - Path path = clinicalVariantFolder.resolve(Paths.get(host).getFileName()); - if (!Files.exists(path)) { - throw new CellBaseException("Could not build clinical variants. 
The file " + path + " is missing"); - } - return path; - } - private void serializeRDB(RocksDB rdb) throws IOException { // DO NOT change the name of the rocksIterator variable - for some unexplainable reason Java VM crashes if it's // named "iterator" @@ -169,7 +216,7 @@ private Variant parseVariantFromVariantId(String variantId) { return new Variant(parts[0].trim(), Integer.parseInt(parts[1].trim()), parts[2], parts[3]); } } catch (Exception e) { - logger.warn(e.getMessage() + ". Impossible to create the variant object from the variant ID: " + variantId); + logger.warn("{}. Impossible to create the variant object from the variant ID: {}", e.getMessage(), variantId); return null; } } @@ -221,4 +268,53 @@ private Object[] getDBConnection(String dbLocation, boolean forceCreate) { } + private void splitClinvar(Path clinvarXmlFilePath, Path splitOutdirPath) throws IOException { + PrintWriter pw = null; + try (BufferedReader br = FileUtils.newBufferedReader(clinvarXmlFilePath)) { + StringBuilder header = new StringBuilder(); + boolean beforeEntry = true; + boolean inEntry = false; + int count = 0; + int chunk = 0; + String line; + while ((line = br.readLine()) != null) { + if (line.trim().startsWith("")) { + inEntry = false; + if (count % 10000 == 0) { + if (pw != null) { + pw.print(""); + pw.close(); + } + chunk++; + } + } + } + if (pw != null) { + pw.print(""); + pw.close(); + } + } finally { + if (pw != null) { + pw.close(); + } + } + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java index c772501738..51be2b6f31 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java @@ -471,7 +471,7 @@ private EvidenceEntry buildCosmic(String[] fields) { String id = 
fields[ID_COLUMN]; String url = "https://cancer.sanger.ac.uk/cosmic/search?q=" + id; - EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.COSMIC_NAME, version, null); + EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.COSMIC_DATA, version, null); SomaticInformation somaticInformation = getSomaticInformation(fields); List genomicFeatureList = getGenomicFeature(fields); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/HGMDIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/HGMDIndexer.java index 2c0d2b3d27..f132f4b9e8 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/HGMDIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/HGMDIndexer.java @@ -95,7 +95,7 @@ private void parseHgmdInfo(Variant variant) { } // Source - entry.setSource(new EvidenceSource(EtlCommons.HGMD_NAME, version, null)); + entry.setSource(new EvidenceSource(EtlCommons.HGMD_DATA, version, null)); // Assembly entry.setAssembly(assembly); diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilderTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilderTest.java index fc5df3af35..aea3b9e7fe 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilderTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilderTest.java @@ -89,7 +89,7 @@ public void noNormaliseTest() throws Exception { .getResource("/variant/annotation/clinicalVariant/ClinVarFullRelease_2020-02.xml.gz").toURI()).toFile(), clinicalVariantChunksFolder.resolve("ClinVarFullRelease_2020-02.xml.gz").toFile()); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), EtlCommons.CLINICAL_VARIANTS_DATA, true); + 
CellBaseSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), EtlCommons.CLINICAL_VARIANT_DATA, true); (new ClinicalVariantBuilder(clinicalVariantFolder, false, genomeSequenceFilePath, "GRCh37", null, serializer)).parse(); List parsedVariantList = loadSerializedVariants("/tmp/" + EtlCommons.CLINICAL_VARIANTS_JSON_FILE); @@ -145,7 +145,7 @@ public void parseMNVTest() throws Exception { Path genomeSequenceFilePath = clinicalVariantFolder.resolve("Homo_sapiens.GRCh37.75.dna.primary_assembly.chr17.fa.gz"); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), EtlCommons.CLINICAL_VARIANTS_DATA, true); + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), EtlCommons.CLINICAL_VARIANT_DATA, true); (new ClinicalVariantBuilder(clinicalVariantFolder, true, genomeSequenceFilePath, "GRCh37", null, serializer)).parse(); List parsedVariantList = loadSerializedVariants("/tmp/" + EtlCommons.CLINICAL_VARIANTS_JSON_FILE); @@ -230,7 +230,7 @@ public void parse() throws Exception { .getResource("/variant/annotation/clinicalVariant/ClinVarFullRelease_2020-02.xml.gz").toURI()).toFile(), clinicalVariantChunksFolder.resolve("ClinVarFullRelease_2020-02.xml.gz").toFile()); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), EtlCommons.CLINICAL_VARIANTS_DATA, true); + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), EtlCommons.CLINICAL_VARIANT_DATA, true); (new ClinicalVariantBuilder(clinicalVariantFolder, true, genomeSequenceFilePath, "GRCh37", null, serializer)).parse(); List parsedVariantList = loadSerializedVariants("/tmp/" + EtlCommons.CLINICAL_VARIANTS_JSON_FILE); From a4fca6bb6b3c82b8d82422226322fa8af06c41b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 10 May 2024 11:10:36 +0200 Subject: [PATCH 071/148] lib: update code to the last changes, #TASK-5564 --- 
.../cellbase/app/cli/admin/AdminCliOptionsParser.java | 4 ++-- .../app/cli/admin/executors/ExportCommandExecutor.java | 6 +++--- .../app/cli/admin/executors/LoadCommandExecutor.java | 6 +++--- .../opencb/cellbase/lib/builders/EnsemblGeneBuilder.java | 2 +- .../opencb/cellbase/lib/builders/RefSeqGeneBuilder.java | 2 +- .../cellbase/lib/download/GenomeDownloadManager.java | 7 +++++++ .../cellbase/lib/download/PharmGKBDownloadManager.java | 1 + .../cellbase/lib/download/PubMedDownloadManager.java | 1 + 8 files changed, 19 insertions(+), 10 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java index 1bda7d2793..15396663a4 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java @@ -91,7 +91,7 @@ public class DownloadCommandOptions { @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to download: " + GENOME_DATA + "," + GENE_DATA + "," + VARIATION_FUNCTIONAL_SCORE_DATA + "," + MISSENSE_VARIATION_SCORE_DATA + "," + REGULATION_DATA + "," + PROTEIN_DATA - + "," + CONSERVATION_DATA + "," + CLINICAL_VARIANTS_DATA + "," + REPEATS_DATA + "," + ONTOLOGY_DATA + "," + PUBMED_DATA + + "," + CONSERVATION_DATA + "," + CLINICAL_VARIANT_DATA + "," + REPEATS_DATA + "," + ONTOLOGY_DATA + "," + PUBMED_DATA + "," + PHARMACOGENOMICS_DATA + "; or use 'all' to download everything", required = true, arity = 1) public String data; @@ -108,7 +108,7 @@ public class BuildCommandOptions { @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to build: " + GENOME_DATA + "," + GENE_DATA + "," + VARIATION_FUNCTIONAL_SCORE_DATA + "," + MISSENSE_VARIATION_SCORE_DATA + "," + REGULATION_DATA + "," + PROTEIN_DATA + "," - + CONSERVATION_DATA + "," + 
CLINICAL_VARIANTS_DATA + "," + REPEATS_DATA + "," + ONTOLOGY_DATA + "," + SPLICE_SCORE_DATA + + CONSERVATION_DATA + "," + CLINICAL_VARIANT_DATA + "," + REPEATS_DATA + "," + ONTOLOGY_DATA + "," + SPLICE_SCORE_DATA + "," + PUBMED_DATA + "," + PHARMACOGENOMICS_DATA + "; or use 'all' to build everything", required = true, arity = 1) public String data; diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java index 85446fac1f..4fba479a36 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java @@ -85,7 +85,7 @@ public ExportCommandExecutor(AdminCliOptionsParser.ExportCommandOptions exportCo this.dataToExport = new String[]{EtlCommons.GENOME_DATA, EtlCommons.GENE_DATA, EtlCommons.REFSEQ_DATA, EtlCommons.CONSERVATION_DATA, EtlCommons.REGULATION_DATA, EtlCommons.PROTEIN_DATA, EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA, EtlCommons.VARIATION_DATA, - EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANTS_DATA, EtlCommons.REPEATS_DATA, + EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANT_DATA, EtlCommons.REPEATS_DATA, ONTOLOGY_DATA, MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, EtlCommons.PHARMACOGENOMICS_DATA}; } else { this.dataToExport = exportCommandOptions.data.split(","); @@ -293,7 +293,7 @@ public void execute() throws CellBaseException { counterMsg = counter + " protein functional predictions"; break; } - case EtlCommons.CLINICAL_VARIANTS_DATA: { + case EtlCommons.CLINICAL_VARIANT_DATA: { counter = exportClinicalVariantData(regions); counterMsg = counter + " clinical variants"; break; @@ -424,7 +424,7 @@ private String exportPharmacogenomicsData(List genes) private int exportClinicalVariantData(List 
regions) throws CellBaseException, QueryException, IllegalAccessException, IOException { - String baseFilename = CLINICAL_VARIANTS_DATA + ".full"; + String baseFilename = CLINICAL_VARIANT_DATA + ".full"; CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(output, baseFilename); ClinicalManager clinicalManager = managerFactory.getClinicalManager(species, assembly); ClinicalVariantQuery query = new ClinicalVariantQuery(); diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index 166c4e7a6f..0eb53b4ad4 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -81,7 +81,7 @@ public LoadCommandExecutor(AdminCliOptionsParser.LoadCommandOptions loadCommandO loadOptions = new String[]{EtlCommons.GENOME_DATA, EtlCommons.GENE_DATA, EtlCommons.REFSEQ_DATA, EtlCommons.CONSERVATION_DATA, EtlCommons.REGULATION_DATA, EtlCommons.PROTEIN_DATA, EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA, EtlCommons.VARIATION_DATA, - EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANTS_DATA, EtlCommons.REPEATS_DATA, + EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANT_DATA, EtlCommons.REPEATS_DATA, EtlCommons.ONTOLOGY_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, PUBMED_DATA, EtlCommons.PHARMACOGENOMICS_DATA}; } else { @@ -257,7 +257,7 @@ public void execute() throws CellBaseException { loadProteinFunctionalPrediction(); break; } - case EtlCommons.CLINICAL_VARIANTS_DATA: { + case EtlCommons.CLINICAL_VARIANT_DATA: { // Load data, create index and update release loadClinical(); break; @@ -461,7 +461,7 @@ private void loadClinical() throws FileNotFoundException { input.resolve("cosmicVersion.json"), 
input.resolve("gwasVersion.json") )); - dataReleaseManager.update(dataRelease, "clinical_variants", EtlCommons.CLINICAL_VARIANTS_DATA, sources); + dataReleaseManager.update(dataRelease, "clinical_variants", EtlCommons.CLINICAL_VARIANT_DATA, sources); } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException | IllegalAccessException | ExecutionException | IOException | InterruptedException | CellBaseException e) { logger.error(e.toString()); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java index a7e6b9f1cf..d6b935fa52 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java @@ -183,7 +183,7 @@ public void check() throws Exception { // Check genome fasta file genomeSequenceFilePath = checkFiles(GENOME_DATA, downloadPath.getParent().getParent().resolve(GENOME_DATA), 1).get(0).toPath(); - logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE); + logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE, ensemblGeneLabel); checked = true; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java index 56e1edd6ff..8f03a801f2 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java @@ -137,7 +137,7 @@ public void check() throws Exception { throw new CellBaseException("The " + getDataName(MIRTARBASE_DATA) + " fixed file " + miRTarBaseFile + " does not exist"); } - logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE); + logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE, refSeqGeneLabel); checked = true; } diff 
--git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index 289ec23258..9b967eb052 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -102,6 +102,7 @@ public List downloadConservation() throws IOException, Interrupted outputPath = conservationFolder.resolve(PHASTCONS_DATA).resolve(filename); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, phastConsUrl, outputPath); downloadFiles.add(downloadFile(phastConsUrl, outputPath.toString())); + logger.info(OK_LOG_MESSAGE); phastconsUrls.add(phastConsUrl); // PhyloP @@ -112,6 +113,7 @@ public List downloadConservation() throws IOException, Interrupted outputPath = conservationFolder.resolve(PHYLOP_DATA).resolve(filename); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, phyloPUrl, outputPath); downloadFiles.add(downloadFile(phyloPUrl, outputPath.toString())); + logger.info(OK_LOG_MESSAGE); phyloPUrls.add(phyloPUrl); } @@ -123,6 +125,8 @@ public List downloadConservation() throws IOException, Interrupted outputPath = conservationFolder.resolve(GERP_DATA).resolve(filename); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, gerpUrl, outputPath); downloadFiles.add(downloadFile(gerpUrl, outputPath.toString())); + logger.info(OK_LOG_MESSAGE); + // Save data version saveDataSource(PHASTCONS_DATA, configuration.getDownload().getPhastCons().getVersion(), getTimeStamp(), phastconsUrls, @@ -162,6 +166,7 @@ public List downloadRepeats() throws IOException, InterruptedExcep Path outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); + logger.info(OK_LOG_MESSAGE); saveDataSource(TRF_DATA, configuration.getDownload().getSimpleRepeats().getVersion(), 
getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(getDataVersionFilename(TRF_DATA))); @@ -171,6 +176,7 @@ public List downloadRepeats() throws IOException, InterruptedExcep outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); + logger.info(OK_LOG_MESSAGE); saveDataSource(GSD_DATA, configuration.getDownload().getGenomicSuperDups().getVersion(), getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(getDataVersionFilename(GSD_DATA))); @@ -181,6 +187,7 @@ public List downloadRepeats() throws IOException, InterruptedExcep outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); + logger.info(OK_LOG_MESSAGE); saveDataSource(WM_DATA, configuration.getDownload().getWindowMasker().getVersion(), getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(getDataVersionFilename(WM_DATA))); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java index 2eeac8415f..25ad390650 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java @@ -55,6 +55,7 @@ public List download() throws IOException, InterruptedException, C Path downloadedFilePath = pharmgkbDownloadFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, downloadedFilePath); DownloadFile downloadFile = downloadFile(url, downloadedFilePath.toString()); + logger.info(OK_LOG_MESSAGE); downloadFiles.add(downloadFile); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java index 6451fd76aa..9006be7a7d 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java @@ -51,6 +51,7 @@ public List download() throws IOException, InterruptedException, C String url = host + filename; logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, pubmedDownloadFolder.resolve(filename)); downloadFiles.add(downloadFile(url, pubmedDownloadFolder.resolve(filename).toString())); + logger.info(OK_LOG_MESSAGE); } // Save data source From 57c6f6f029debad8b4c197599a51c781ab72fac1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Sat, 11 May 2024 08:22:58 +0200 Subject: [PATCH 072/148] lib: include SpliceAI/MMSplice in the configuration file, and create the splice score downloader to create the splice score version files, #TASK-5575, #TASK-5564 --- .../app/cli/admin/AdminCliOptionsParser.java | 2 + .../admin/executors/BuildCommandExecutor.java | 8 +-- .../executors/DownloadCommandExecutor.java | 23 ++----- .../admin/executors/LoadCommandExecutor.java | 10 +-- .../core/config/DownloadProperties.java | 20 ++++++ .../src/main/resources/configuration.yml | 8 +++ .../org/opencb/cellbase/lib/EtlCommons.java | 19 ++++-- .../cellbase/lib/builders/SpliceBuilder.java | 19 +++--- .../lib/download/AbstractDownloadManager.java | 6 +- .../cellbase/lib/download/Downloader.java | 5 ++ .../download/SpliceScoreDownloadManager.java | 67 +++++++++++++++++++ 11 files changed, 145 insertions(+), 42 deletions(-) create mode 100644 cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/SpliceScoreDownloadManager.java diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java index 
15396663a4..17341eb8c6 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java @@ -17,9 +17,11 @@ package org.opencb.cellbase.app.cli.admin; import com.beust.jcommander.*; +import org.apache.commons.lang3.StringUtils; import org.opencb.cellbase.app.cli.CliOptionsParser; import org.opencb.cellbase.core.api.key.ApiKeyQuota; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 899e5f52d3..0c342782cc 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -357,16 +357,16 @@ private Path getFastaReferenceGenome() throws CellBaseException { return fastaPath; } - private CellBaseBuilder buildSplice() throws IOException { + private CellBaseBuilder buildSplice() throws IOException, CellBaseException { Path spliceInputFolder = downloadFolder.resolve(EtlCommons.SPLICE_SCORE_DATA); Path spliceOutputFolder = buildFolder.resolve(EtlCommons.SPLICE_SCORE_DATA); if (!spliceOutputFolder.toFile().exists()) { spliceOutputFolder.toFile().mkdirs(); } - if (spliceInputFolder.resolve(EtlCommons.MMSPLICE_VERSION_FILENAME).toFile().exists()) { - Files.copy(spliceInputFolder.resolve(EtlCommons.MMSPLICE_VERSION_FILENAME), - spliceOutputFolder.resolve(EtlCommons.MMSPLICE_VERSION_FILENAME), + if (spliceInputFolder.resolve(getDataVersionFilename(MMSPLICE_DATA)).toFile().exists()) { + Files.copy(spliceInputFolder.resolve(getDataVersionFilename(MMSPLICE_DATA)), + spliceOutputFolder.resolve(EtlCommons.getDataVersionFilename(MMSPLICE_DATA)), 
StandardCopyOption.REPLACE_EXISTING); } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java index 5a0fb00877..f309b22041 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java @@ -40,9 +40,9 @@ public class DownloadCommandExecutor extends CommandExecutor { private AdminCliOptionsParser.DownloadCommandOptions downloadCommandOptions; private Path outputDirectory; - private static final List VALID_SOURCES_TO_DOWNLOAD = Arrays.asList(GENOME_DATA, GENE_DATA, VARIATION_FUNCTIONAL_SCORE_DATA, + public static final List VALID_SOURCES_TO_DOWNLOAD = Arrays.asList(GENOME_DATA, GENE_DATA, VARIATION_FUNCTIONAL_SCORE_DATA, MISSENSE_VARIATION_SCORE_DATA, REGULATION_DATA, PROTEIN_DATA, CONSERVATION_DATA, CLINICAL_VARIANT_DATA, REPEATS_DATA, - ONTOLOGY_DATA, PUBMED_DATA, PHARMACOGENOMICS_DATA); + ONTOLOGY_DATA, SPLICE_SCORE_DATA, PUBMED_DATA, PHARMACOGENOMICS_DATA); public DownloadCommandExecutor(AdminCliOptionsParser.DownloadCommandOptions downloadCommandOptions) { super(downloadCommandOptions.commonOptions.logLevel, downloadCommandOptions.commonOptions.conf); @@ -95,6 +95,9 @@ public void execute() throws CellBaseException { case ONTOLOGY_DATA: downloadFiles.addAll(downloader.downloadOntologies()); break; + case SPLICE_SCORE_DATA: + downloadFiles.addAll(downloader.downloadSpliceScores()); + break; case PUBMED_DATA: downloadFiles.addAll(downloader.downloadPubMed()); break; @@ -124,21 +127,7 @@ private List checkDataSources() { } List dataList = Arrays.asList(downloadCommandOptions.data.split(",")); for (String data : dataList) { - switch (data) { - case GENOME_DATA: - case GENE_DATA: - case VARIATION_FUNCTIONAL_SCORE_DATA: - case MISSENSE_VARIATION_SCORE_DATA: 
- case REGULATION_DATA: - case PROTEIN_DATA: - case CONSERVATION_DATA: - case CLINICAL_VARIANT_DATA: - case REPEATS_DATA: - case ONTOLOGY_DATA: - case PUBMED_DATA: - case PHARMACOGENOMICS_DATA: - break; - default: + if (!VALID_SOURCES_TO_DOWNLOAD.contains(data)) { throw new IllegalArgumentException("Value '" + data + "' is not allowed for the data parameter. Valid values are: " + StringUtils.join(VALID_SOURCES_TO_DOWNLOAD, ",") + "; or use 'all' to download everything"); } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index 0eb53b4ad4..7861f25dbe 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -508,19 +508,19 @@ private void loadSpliceScores() throws NoSuchMethodException, InterruptedExcepti // Load data logger.info("Loading splice scores from '{}'", input); // MMSplice scores - loadSpliceScores(input.resolve(EtlCommons.SPLICE_SCORE_DATA + "/" + EtlCommons.MMSPLICE_SUBDIRECTORY)); + loadSpliceScores(input.resolve(SPLICE_SCORE_DATA + "/" + MMSPLICE_DATA)); // SpliceAI scores - loadSpliceScores(input.resolve(EtlCommons.SPLICE_SCORE_DATA + "/" + EtlCommons.SPLICEAI_SUBDIRECTORY)); + loadSpliceScores(input.resolve(SPLICE_SCORE_DATA + "/" + SPLICEAI_DATA)); // Create index createIndex("splice_score"); // Update release (collection and sources) List sources = new ArrayList<>(Arrays.asList( - input.resolve(EtlCommons.SPLICE_SCORE_DATA + "/" + EtlCommons.MMSPLICE_VERSION_FILENAME), - input.resolve(EtlCommons.SPLICE_SCORE_DATA + "/" + EtlCommons.SPLICEAI_VERSION_FILENAME) + input.resolve(SPLICE_SCORE_DATA + "/" + getDataVersionFilename(MMSPLICE_DATA)), + input.resolve(SPLICE_SCORE_DATA + "/" + getDataVersionFilename(SPLICEAI_DATA)) )); - 
dataReleaseManager.update(dataRelease, "splice_score", EtlCommons.SPLICE_SCORE_DATA, sources); + dataReleaseManager.update(dataRelease, "splice_score", SPLICE_SCORE_DATA, sources); } private void loadSpliceScores(Path spliceFolder) throws IOException, ExecutionException, InterruptedException, diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java index a52e7ce544..e7564a0b30 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java @@ -65,6 +65,8 @@ public class DownloadProperties { private URLProperties mondoObo; private URLProperties goAnnotation; private URLProperties revel; + private URLProperties mmSplice; + private URLProperties spliceAi; private URLProperties pubmed; private URLProperties pharmGKB; @@ -406,6 +408,24 @@ public DownloadProperties setRevel(URLProperties revel) { return this; } + public URLProperties getMmSplice() { + return mmSplice; + } + + public DownloadProperties setMmSplice(URLProperties mmSplice) { + this.mmSplice = mmSplice; + return this; + } + + public URLProperties getSpliceAi() { + return spliceAi; + } + + public DownloadProperties setSpliceAi(URLProperties spliceAi) { + this.spliceAi = spliceAi; + return this; + } + public URLProperties getPubmed() { return pubmed; } diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index d95e6d7f06..088a2d3daa 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -269,6 +269,14 @@ download: files: MONDO: mondo.obo + ## Splice score + mmSplice: + host: http://kipoi.org/models/MMSplice/mtsplice/ + version: 2.0 + spliceAi: + host: https://basespace.illumina.com/s/otSPW8hnhaZR + version: 1.3.1 + ## Others pubmed: 
host: https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/ diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 57a592bb54..aed87ae3e1 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -143,7 +143,6 @@ public final class EtlCommons { public static final String GO_ANNOTATION_FILE_ID = "GO_ANNOTATION"; public static final String VARIATION_DATA = "variation"; - public static final String SPLICE_SCORE_DATA = "splice_score"; // Pharmacogenomics public static final String PHARMACOGENOMICS_DATA = "pharmacogenomics"; @@ -297,10 +296,11 @@ public final class EtlCommons { public static final String PHYLOP_FILE_ID = "PHYLOP"; // Splice scores - public static final String MMSPLICE_SUBDIRECTORY = "mmsplice"; - public static final String MMSPLICE_VERSION_FILENAME = MMSPLICE_SUBDIRECTORY + SUFFIX_VERSION_FILENAME; - public static final String SPLICEAI_SUBDIRECTORY = "spliceai"; - public static final String SPLICEAI_VERSION_FILENAME = SPLICEAI_SUBDIRECTORY + SUFFIX_VERSION_FILENAME; + public static final String SPLICE_SCORE_DATA = "splice_score"; + // MMSplice + public static final String MMSPLICE_DATA = "mmsplice"; + // SpliceAI + public static final String SPLICEAI_DATA = "spliceai"; /** * @deprecated (when refactoring downloaders, builders and loaders) @@ -335,7 +335,6 @@ public final class EtlCommons { dataNamesMap.put(GENOME_DATA, "Genome"); dataNamesMap.put(GENE_DATA, "Gene"); dataNamesMap.put(GENE_ANNOTATION_DATA, "Gene Annotation"); - dataCategoriesMap.put(REFSEQ_DATA, "Gene"); dataNamesMap.put(MANE_SELECT_DATA, "MANE Select"); dataNamesMap.put(LRG_DATA, "LRG"); dataNamesMap.put(HGNC_DATA, "HGNC Gene"); @@ -382,6 +381,10 @@ public final class EtlCommons { dataNamesMap.put(COSMIC_DATA, "Cosmic"); dataNamesMap.put(HGMD_DATA, "HGMD"); dataNamesMap.put(GWAS_DATA, "GWAS Catalog"); 
+ dataNamesMap.put(SPLICE_SCORE_DATA, "Splice Score"); + dataNamesMap.put(MMSPLICE_DATA, "MMSplice"); + dataNamesMap.put(SPLICEAI_DATA, "SpliceAI"); + // Populate data categories map dataCategoriesMap.put(ENSEMBL_DATA, "Gene"); @@ -423,6 +426,8 @@ public final class EtlCommons { dataCategoriesMap.put(COSMIC_DATA, dataNamesMap.get(CLINICAL_VARIANT_DATA)); dataCategoriesMap.put(HGMD_DATA, dataNamesMap.get(CLINICAL_VARIANT_DATA)); dataCategoriesMap.put(GWAS_DATA, dataNamesMap.get(CLINICAL_VARIANT_DATA)); + dataCategoriesMap.put(MMSPLICE_DATA, dataNamesMap.get(SPLICE_SCORE_DATA)); + dataCategoriesMap.put(SPLICEAI_DATA, dataNamesMap.get(SPLICE_SCORE_DATA)); // Populate data version filenames Map dataVersionFilenamesMap.put(ENSEMBL_DATA, "ensemblCore" + SUFFIX_VERSION_FILENAME); @@ -464,6 +469,8 @@ public final class EtlCommons { dataVersionFilenamesMap.put(COSMIC_DATA, "cosmic" + SUFFIX_VERSION_FILENAME); dataVersionFilenamesMap.put(HGMD_DATA, "hgmd" + SUFFIX_VERSION_FILENAME); dataVersionFilenamesMap.put(GWAS_DATA, "gwas" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(MMSPLICE_DATA, "mmSplice" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(SPLICEAI_DATA, "spliceAi" + SUFFIX_VERSION_FILENAME); } private EtlCommons() { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/SpliceBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/SpliceBuilder.java index ddff52328b..f0f08c65d3 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/SpliceBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/SpliceBuilder.java @@ -24,7 +24,6 @@ import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.tools.variant.VariantNormalizer; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; -import org.opencb.cellbase.lib.EtlCommons; import org.opencb.commons.utils.FileUtils; import org.rocksdb.RocksDB; import org.rocksdb.RocksDBException; @@ -35,7 +34,11 @@ 
import java.io.File; import java.io.IOException; import java.nio.file.Path; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; + +import static org.opencb.cellbase.lib.EtlCommons.MMSPLICE_DATA; +import static org.opencb.cellbase.lib.EtlCommons.SPLICEAI_DATA; public class SpliceBuilder extends CellBaseBuilder { @@ -58,14 +61,14 @@ public void parse() throws Exception { logger.info("Parsing splice files..."); - Path splicePath = spliceDir.resolve(EtlCommons.MMSPLICE_SUBDIRECTORY); + Path splicePath = spliceDir.resolve(MMSPLICE_DATA); if (splicePath.toFile().exists()) { logger.info("Parsing MMSplice data..."); mmspliceParser(splicePath); } else { logger.debug("MMSplice data not found: " + splicePath); } - splicePath = spliceDir.resolve(EtlCommons.SPLICEAI_SUBDIRECTORY); + splicePath = spliceDir.resolve(SPLICEAI_DATA); if (splicePath.toFile().exists()) { logger.info("Parsing SpliceAI data..."); spliceaiParser(splicePath); @@ -85,7 +88,7 @@ public void parse() throws Exception { */ private void mmspliceParser(Path mmsplicePath) throws IOException { // Check output folder: MMSplice - Path mmspliceOutFolder = fileSerializer.getOutdir().resolve(EtlCommons.MMSPLICE_SUBDIRECTORY); + Path mmspliceOutFolder = fileSerializer.getOutdir().resolve(MMSPLICE_DATA); if (!mmspliceOutFolder.toFile().exists()) { mmspliceOutFolder.toFile().mkdirs(); } @@ -177,7 +180,7 @@ private void mmspliceParser(Path mmsplicePath) throws IOException { } // Dump rocksDB to JSON file - dumpRocksDB(EtlCommons.MMSPLICE_SUBDIRECTORY + "/splice_score_mmsplice_chr", rocksDB); + dumpRocksDB(MMSPLICE_DATA + "/splice_score_mmsplice_chr", rocksDB); // Clean up rocksDB.close(); @@ -195,7 +198,7 @@ private void mmspliceParser(Path mmsplicePath) throws IOException { */ private void spliceaiParser(Path spliceaiPath) throws IOException { // Check output folder: MMSplice - Path spliceaiOutFolder = fileSerializer.getOutdir().resolve(EtlCommons.SPLICEAI_SUBDIRECTORY); + Path spliceaiOutFolder = 
fileSerializer.getOutdir().resolve(SPLICEAI_DATA); if (!spliceaiOutFolder.toFile().exists()) { spliceaiOutFolder.toFile().mkdirs(); } @@ -292,7 +295,7 @@ private void spliceaiParser(Path spliceaiPath) throws IOException { } // Dump rocksDB to JSON file - dumpRocksDB(EtlCommons.SPLICEAI_SUBDIRECTORY + "/splice_score_spliceai_chr", rocksDB); + dumpRocksDB(SPLICEAI_DATA + "/splice_score_spliceai_chr", rocksDB); // Clean up rocksDB.close(); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index 7ac8bcf800..b9d784762a 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -52,9 +52,9 @@ public abstract class AbstractDownloadManager { protected static final String DOWNLOADING_LOG_MESSAGE = "Downloading {} ..."; - protected static final String DOWNLOADING_DONE_LOG_MESSAGE = "Ok. {}"; + protected static final String DOWNLOADING_DONE_LOG_MESSAGE = "Ok ({})"; protected static final String CATEGORY_DOWNLOADING_LOG_MESSAGE = "Downloading {}/{} ..."; - protected static final String CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE = "Ok. 
{}/{}"; + protected static final String CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE = "Ok ({}/{})"; protected static final String DOWNLOADING_FROM_TO_LOG_MESSAGE = "Downloading {} to {} ..."; protected String species; @@ -259,6 +259,8 @@ protected void saveDataSource(String data, String version, String date, List downloadPredictionScores() throws IOException, CellBas return manager.download(); } + public List downloadSpliceScores() throws IOException, CellBaseException, InterruptedException { + SpliceScoreDownloadManager manager = new SpliceScoreDownloadManager(species, assembly, outputDirectory, configuration); + return manager.download(); + } + public List downloadPubMed() throws IOException, CellBaseException, InterruptedException { PubMedDownloadManager manager = new PubMedDownloadManager(species, assembly, outputDirectory, configuration); return manager.download(); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/SpliceScoreDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/SpliceScoreDownloadManager.java new file mode 100644 index 0000000000..7c0f1c0c94 --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/SpliceScoreDownloadManager.java @@ -0,0 +1,67 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.opencb.cellbase.lib.download; + +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.config.DownloadProperties; +import org.opencb.cellbase.core.exception.CellBaseException; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Collections; +import java.util.List; + +import static org.opencb.cellbase.lib.EtlCommons.*; + +public class SpliceScoreDownloadManager extends AbstractDownloadManager { + + public SpliceScoreDownloadManager(String species, String assembly, Path outdir, CellBaseConfiguration configuration) + throws IOException, CellBaseException { + super(species, assembly, outdir, configuration); + } + + @Override + public List download() throws IOException, InterruptedException, CellBaseException { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(SPLICE_SCORE_DATA)); + if (!speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + logger.info("{} not supported for the species {}", getDataName(SPLICE_SCORE_DATA), + speciesConfiguration.getScientificName()); + return Collections.emptyList(); + } + + // Create splice score directory + Path spliceScorePath = downloadFolder.resolve(SPLICE_SCORE_DATA).toAbsolutePath(); + Files.createDirectories(spliceScorePath); + + // SpliceAI + saveSpliceScoreSource(SPLICEAI_DATA, configuration.getDownload().getSpliceAi(), spliceScorePath); + + // MMSplice + saveSpliceScoreSource(MMSPLICE_DATA, configuration.getDownload().getMmSplice(), spliceScorePath); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(SPLICE_SCORE_DATA)); + return Collections.emptyList(); + } + + private void saveSpliceScoreSource(String data, DownloadProperties.URLProperties props, Path spliceScorePath) + throws CellBaseException, IOException { + logger.warn("{} files must be downloaded manually !", getDataName(data)); + saveDataSource(data, props.getVersion(), getTimeStamp(), Collections.singletonList(props.getHost()), + 
spliceScorePath.resolve(getDataVersionFilename(data))); + } +} From c1314596d8f284c46fbe37f13764fb7c2ce2b0b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Sat, 11 May 2024 08:28:37 +0200 Subject: [PATCH 073/148] lib: remove deprecated functions, #TASK-5575, #TASK-5564 --- .../lib/download/AbstractDownloadManager.java | 62 ------------------- 1 file changed, 62 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index b9d784762a..2d7c5da58a 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -162,27 +162,6 @@ protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLPropertie return downloadFile; } - @Deprecated - protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLProperties props, String fileId, String name, String category, - String versionFilename, Path outPath) - throws IOException, InterruptedException, CellBaseException { - return downloadAndSaveDataSource(props, fileId, name, category, null, versionFilename, outPath); - } - - @Deprecated - protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLProperties props, String fileId, String name, String category, - String chromosome, String versionFilename, Path outPath) - throws IOException, InterruptedException, CellBaseException { - // Download file - DownloadFile downloadFile = downloadDataSource(props, fileId, chromosome, outPath); - - // Save data source - saveDataSource(name, category, props.getVersion(), getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - outPath.resolve(versionFilename)); - - return downloadFile; - } - protected DownloadFile 
downloadAndSaveEnsemblDataSource(DownloadProperties.EnsemblProperties ensemblProps, String fileId, String data, Path outPath) throws IOException, InterruptedException, CellBaseException { return downloadAndSaveEnsemblDataSource(ensemblProps, fileId, data, null, outPath); @@ -201,20 +180,6 @@ protected DownloadFile downloadAndSaveEnsemblDataSource(DownloadProperties.Ensem return downloadFile; } - @Deprecated - protected DownloadFile downloadAndSaveEnsemblDataSource(DownloadProperties.EnsemblProperties ensemblProps, String fileId, String name, - String category, String chromosome, String versionFilename, Path outPath) - throws IOException, InterruptedException, CellBaseException { - // Download file - DownloadFile downloadFile = downloadEnsemblDataSource(ensemblProps, fileId, chromosome, outPath); - - // Save data source - saveDataSource(name, category, "(Ensembl " + ensemblVersion + ")", getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - outPath.resolve(versionFilename)); - - return downloadFile; - } - protected DownloadFile downloadDataSource(DownloadProperties.URLProperties props, String fileId, Path outPath) throws IOException, InterruptedException, CellBaseException { return downloadDataSource(props, fileId, null, outPath); @@ -263,19 +228,6 @@ protected void saveDataSource(String data, String version, String date, List urls, Path versionFilePath) - throws IOException { - DataSource dataSource = new DataSource(name, category, version, date, urls); - - if (StringUtils.isEmpty(version)) { - logger.warn("Version missing for data source {}/{}, using the date as version: {}", category, name, date); - dataSource.setVersion(date); - } - - dataSourceWriter.writeValue(versionFilePath.toFile(), dataSource); - } - protected String getTimeStamp() { return new SimpleDateFormat("yyyyMMdd_HHmmss").format(Calendar.getInstance().getTime()); } @@ -404,20 +356,6 @@ private String getEnsemblURL(SpeciesConfiguration sp) { return 
configuration.getDownload().getEnsemblGenomes().getUrl().getHost(); } } - - @Deprecated - protected String getUrl(DownloadProperties.URLProperties props, String fileId) throws CellBaseException { - if (!props.getFiles().containsKey(fileId)) { - throw new CellBaseException("File ID " + fileId + " is missing in the DownloadProperties.URLProperties within the CellBase" - + " configuration file"); - } - String filesValue = props.getFiles().get(fileId); - if (filesValue.startsWith("https://") || filesValue.startsWith("http://") || filesValue.startsWith("ftp://")) { - return filesValue; - } else { - return props.getHost() + filesValue; - } - } } From a8a047c546752beba7253e576779fdbcc719b927 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 16 May 2024 14:31:54 +0200 Subject: [PATCH 074/148] lib: improve gene downloader by taking into account the manually downloaded files and creating the data source version files, #TASK-5575, #TASK-5564 --- .../core/config/DownloadProperties.java | 10 +++ .../src/main/resources/configuration.yml | 21 +++++- .../org/opencb/cellbase/lib/EtlCommons.java | 66 +++++++++++++++---- .../lib/download/AbstractDownloadManager.java | 4 +- .../lib/download/GeneDownloadManager.java | 28 +++++--- 5 files changed, 102 insertions(+), 27 deletions(-) diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java index e7564a0b30..7d9adfac3c 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java @@ -54,6 +54,7 @@ public class DownloadProperties { private URLProperties disgenet; private URLProperties disgenetReadme; private URLProperties dgidb; + private URLProperties cancerGeneCensus; private URLProperties gwasCatalog; private URLProperties dbsnp; private 
URLProperties cadd; @@ -309,6 +310,15 @@ public DownloadProperties setDgidb(URLProperties dgidb) { return this; } + public URLProperties getCancerGeneCensus() { + return cancerGeneCensus; + } + + public DownloadProperties setCancerGeneCensus(URLProperties cancerGeneCensus) { + this.cancerGeneCensus = cancerGeneCensus; + return this; + } + public URLProperties getGwasCatalog() { return gwasCatalog; } diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 088a2d3daa..747bf94a7c 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -63,6 +63,17 @@ download: REGULATORY_BUILD: "release-put_release_here/regulation/put_species_here/put_species_here.put_assembly_here.Regulatory_Build.regulatory_features.20221007.gff.gz" MOTIF_FEATURES: "release-put_release_here/regulation/put_species_here/MotifFeatures/put_species_here.put_assembly_here.motif_features.gff.gz" MOTIF_FEATURES_INDEX: "release-put_release_here/regulation/put_species_here/MotifFeatures/put_species_here.put_assembly_here.motif_features.gff.gz.tbi" + # To be generated manually + DESCRIPTION: "manual@description.txt" + # To be generated manually + XREFS: "manual@xrefs.txt" + # To be downloaded manually + HAEM_ONC_TRANSCRIPTS: "manual@EGLH_HaemOnc_transcripts.txt" + # To be downloaded manually + TSO500: "manual@TSO500_transcripts.txt" + # To be downloaded manually + CANONICAL: "manual@ensembl_canonical.txt" + ensemblGenomes: database: host: mysql-eg-publicsql.ebi.ac.uk:4157 @@ -116,8 +127,10 @@ download: GENE_EXPRESSION_ATLAS: pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz hpo: ## NOTE: Download manually from here now + host: https://hpo.jax.org/app/data/annotations/ version: "2024-04-26" - host: https://hpo.jax.org/app/data/annotations + files: + HPO: "manual@phenotype_to_genes.txt" disgenet: host: https://www.disgenet.org/ version: "7.0 
(January 2020)" @@ -132,6 +145,12 @@ download: host: http://geneontology.org/ files: GO_ANNOTATION: gene-associations/goa_human.gaf.gz + cancerGeneCensus: + ## To be downloaded manually + host: https://cancer.sanger.ac.uk/census/ + version: "v99" + files: + CANCER_GENE_CENSUS: "manual@cancer-gene-census.tsv" ## Regulation mirbase: diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index aed87ae3e1..1e61c44ffa 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -45,6 +45,18 @@ public final class EtlCommons { // Commons + public static final String HOMO_SAPIENS_NAME= "Homo sapiens"; + public static final String HSAPIENS_NAME= "hsapiens"; + + public static final String GRCH38_NAME = "GRCh38"; + public static final String GRCH37_NAME = "GRCh37"; + public static final String HG38_NAME = "hg38"; + public static final String HG19_NAME = "hg19"; + + public static final String MANUAL_PREFIX = "manual@"; + + public static final String SUFFIX_VERSION_FILENAME = "Version.json"; + public static final String XLSX_EXTENSION = ".xlsx"; public static final String CSV_EXTENSION = ".csv"; public static final String TBI_EXTENSION = ".tbi"; @@ -67,16 +79,11 @@ public final class EtlCommons { public static final String ENSEMBL_REGULATORY_BUILD_FILE_ID = "REGULATORY_BUILD"; public static final String ENSEMBL_MOTIF_FEATURES_FILE_ID = "MOTIF_FEATURES"; public static final String ENSEMBL_MOTIF_FEATURES_INDEX_FILE_ID = "MOTIF_FEATURES_INDEX"; - - public static final String HOMO_SAPIENS_NAME= "Homo sapiens"; - public static final String HSAPIENS_NAME= "hsapiens"; - - public static final String GRCH38_NAME = "GRCh38"; - public static final String GRCH37_NAME = "GRCh37"; - public static final String HG38_NAME = "hg38"; - public static final String HG19_NAME = "hg19"; - - public static final 
String SUFFIX_VERSION_FILENAME = "Version.json"; + public static final String ENSEMBL_DESCRIPTION_FILE_ID = "DESCRIPTION"; + public static final String ENSEMBL_XREFS_FILE_ID = "XREFS"; + public static final String ENSEMBL_HAEM_ONC_TRANSCRIPTS_FILE_ID = "HAEM_ONC_TRANSCRIPTS"; + public static final String ENSEMBL_TSO500_FILE_ID = "TSO500"; + public static final String ENSEMBL_CANONICAL_FILE_ID = "CANONICAL"; // Genome public static final String GENOME_DATA = "genome"; @@ -128,7 +135,9 @@ public final class EtlCommons { // - Gene Disease Annotation public static final String GENE_DISEASE_ANNOTATION_NAME = "Gene Disease Annotation"; // - HPO - public static final String HPO_DATA = "hpo"; + public static final String HPO_DISEASE_DATA = "hpo_disease"; + // Must match the configuration file + public static final String HPO_FILE_ID = "HPO"; // - DISGENET public static final String DISGENET_DATA = "disgenet"; // Must match the configuration file @@ -141,6 +150,10 @@ public final class EtlCommons { public static final String GO_ANNOTATION_DATA = "go_annotation"; // Must match the configuration file public static final String GO_ANNOTATION_FILE_ID = "GO_ANNOTATION"; + // - Cancer Gene Census + public static final String CANCER_GENE_CENSUS_DATA = "cancer_gene_census"; + // Must match the configuration file + public static final String CANCER_GENE_CENSUS_FILE_ID = "CANCER_GENE_CENSUS"; public static final String VARIATION_DATA = "variation"; @@ -343,10 +356,11 @@ public final class EtlCommons { dataNamesMap.put(UNIPROT_XREF_DATA, "UniProt Xref"); dataNamesMap.put(GENE_EXPRESSION_ATLAS_DATA, "Gene Expression Atlas"); dataNamesMap.put(GENE_DISEASE_ANNOTATION_DATA, "Gene Disease Annotation"); - dataNamesMap.put(HPO_DATA, "HPO"); + dataNamesMap.put(HPO_DISEASE_DATA, "HPO Disease"); dataNamesMap.put(DISGENET_DATA, "DisGeNet"); dataNamesMap.put(GNOMAD_CONSTRAINTS_DATA, "gnomAD Constraint"); dataNamesMap.put(GO_ANNOTATION_DATA, "EBI Gene Ontology Annotation"); + 
dataNamesMap.put(CANCER_GENE_CENSUS_DATA, "Cancer Gene Census"); dataNamesMap.put(PROTEIN_DATA, "Protein"); dataNamesMap.put(UNIPROT_DATA, "UniProt"); dataNamesMap.put(INTERPRO_DATA, "InterPro"); @@ -397,10 +411,11 @@ public final class EtlCommons { dataCategoriesMap.put(DGIDB_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); dataCategoriesMap.put(UNIPROT_XREF_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); dataCategoriesMap.put(GENE_EXPRESSION_ATLAS_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); - dataCategoriesMap.put(HPO_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(HPO_DISEASE_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); dataCategoriesMap.put(DISGENET_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); dataCategoriesMap.put(GNOMAD_CONSTRAINTS_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); dataCategoriesMap.put(GO_ANNOTATION_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(CANCER_GENE_CENSUS_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); dataCategoriesMap.put(UNIPROT_DATA, dataNamesMap.get(PROTEIN_DATA)); dataCategoriesMap.put(INTERPRO_DATA, dataNamesMap.get(PROTEIN_DATA)); dataCategoriesMap.put(INTACT_DATA, dataNamesMap.get(PROTEIN_DATA)); @@ -440,10 +455,11 @@ public final class EtlCommons { dataVersionFilenamesMap.put(DGIDB_DATA, "dgidb" + SUFFIX_VERSION_FILENAME); dataVersionFilenamesMap.put(UNIPROT_XREF_DATA, "uniProtXref" + SUFFIX_VERSION_FILENAME); dataVersionFilenamesMap.put(GENE_EXPRESSION_ATLAS_DATA, "geneExpressionAtlas" + SUFFIX_VERSION_FILENAME); - dataVersionFilenamesMap.put(HPO_DATA, "hpo" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(HPO_DISEASE_DATA, "hpoDisease" + SUFFIX_VERSION_FILENAME); dataVersionFilenamesMap.put(DISGENET_DATA, "disGeNet" + SUFFIX_VERSION_FILENAME); dataVersionFilenamesMap.put(GNOMAD_CONSTRAINTS_DATA, "gnomadConstraints" + SUFFIX_VERSION_FILENAME); dataVersionFilenamesMap.put(GO_ANNOTATION_DATA, "goAnnotation" + SUFFIX_VERSION_FILENAME); + 
dataVersionFilenamesMap.put(CANCER_GENE_CENSUS_DATA, "cancerGeneCensus" + SUFFIX_VERSION_FILENAME); dataVersionFilenamesMap.put(UNIPROT_DATA, "uniProt" + SUFFIX_VERSION_FILENAME); dataVersionFilenamesMap.put(INTERPRO_DATA, "interPro" + SUFFIX_VERSION_FILENAME); dataVersionFilenamesMap.put(INTACT_DATA, "intAct" + SUFFIX_VERSION_FILENAME); @@ -673,4 +689,26 @@ public static String getDataVersionFilename(String data) throws CellBaseExceptio public static List getUrls(List downloadFiles) { return downloadFiles.stream().map(DownloadFile::getUrl).collect(Collectors.toList()); } + + public static List getManualUrls(DownloadProperties.URLProperties props) { + List urls = new ArrayList<>(); + for (String value : props.getFiles().values()) { + String url = getManualUrl(props.getHost(), value); + if (StringUtils.isNotEmpty(url)) { + urls.add(url); + } + } + return urls; + } + + public static String getManualUrl(DownloadProperties.URLProperties props, String fileId) { + return getManualUrl(props.getHost(), props.getFiles().get(fileId)); + } + + public static String getManualUrl(String host, String file) { + if (file.startsWith(MANUAL_PREFIX)) { + return MANUAL_PREFIX + host + file.replace(MANUAL_PREFIX, ""); + } + return null; + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index 2d7c5da58a..a87faeb611 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -52,9 +52,9 @@ public abstract class AbstractDownloadManager { protected static final String DOWNLOADING_LOG_MESSAGE = "Downloading {} ..."; - protected static final String DOWNLOADING_DONE_LOG_MESSAGE = "Ok ({})"; + protected static final String DOWNLOADING_DONE_LOG_MESSAGE = "Downloading {} done."; protected static final String 
CATEGORY_DOWNLOADING_LOG_MESSAGE = "Downloading {}/{} ..."; - protected static final String CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE = "Ok ({}/{})"; + protected static final String CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE = "Downloading {}/{} done."; protected static final String DOWNLOADING_FROM_TO_LOG_MESSAGE = "Downloading {} to {} ..."; protected String species; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index ee332dd8ea..06f568d103 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -83,6 +83,20 @@ public List download() throws IOException, InterruptedException, C downloadFiles.add(downloadGO(geneDownloadPath)); logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_ANNOTATION_DATA)); + // Save data sources manually downloaded + // HPO + saveDataSource(HPO_DISEASE_DATA, configuration.getDownload().getHpo().getVersion(), getTimeStamp(), + Collections.singletonList(getManualUrl(configuration.getDownload().getHpo(), HPO_FILE_ID)), + geneDownloadPath.resolve(getDataVersionFilename(HPO_DISEASE_DATA))); + logger.warn("{} must be downloaded manually; the version file {} was created at {}", getDataName(HPO_DISEASE_DATA), + getDataVersionFilename(HPO_DISEASE_DATA), geneDownloadPath); + // Cancer gene census + saveDataSource(CANCER_GENE_CENSUS_DATA, configuration.getDownload().getCancerGeneCensus().getVersion(), getTimeStamp(), + Collections.singletonList(getManualUrl(configuration.getDownload().getCancerGeneCensus(), CANCER_GENE_CENSUS_FILE_ID)), + geneDownloadPath.resolve(getDataVersionFilename(CANCER_GENE_CENSUS_DATA))); + logger.warn("{} must be downloaded manually; the version file {} was created at {}", getDataName(CANCER_GENE_CENSUS_DATA), + 
getDataVersionFilename(CANCER_GENE_CENSUS_DATA), geneDownloadPath); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_DATA)); return downloadFiles; @@ -102,7 +116,10 @@ private List downloadEnsemblData(Path ensemblDownloadPath) throws downloadFiles.add(downloadEnsemblDataSource(ensemblProps, ENSEMBL_CDNA_FA_FILE_ID, ensemblDownloadPath)); // Save data source (i.e., metadata) - saveDataSource(ENSEMBL_DATA, ensemblVersion, getTimeStamp(), getUrls(downloadFiles), + List urls = getUrls(downloadFiles); + // Add manually downloaded files + urls.addAll(getManualUrls(ensemblProps.getUrl())); + saveDataSource(ENSEMBL_DATA, ensemblVersion, getTimeStamp(), urls, ensemblDownloadPath.resolve(getDataVersionFilename(ENSEMBL_DATA))); logger.info(CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE, getDataName(ENSEMBL_DATA), getDataCategory(ENSEMBL_DATA)); @@ -226,15 +243,6 @@ private DownloadFile downloadGeneExpressionAtlas(Path geneDownloadPath) throws I private DownloadFile downloadGeneDiseaseAnnotation(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_DISEASE_ANNOTATION_DATA)); - // HPO - // IMPORTANT !!! 
- logger.warn("{} must be downloaded manually from {} and then create the file {} with data ({}), name ({}) and the version", - getDataName(HPO_DATA), configuration.getDownload().getHpo().getHost(), getDataVersionFilename(HPO_DATA), - getDataCategory(HPO_DATA), getDataName(HPO_DATA)); - saveDataSource(HPO_DATA, configuration.getDownload().getHpo().getVersion(), getTimeStamp(), - Collections.singletonList(configuration.getDownload().getHpo().getHost()), - geneDownloadPath.resolve(getDataVersionFilename(HPO_DATA))); - // DisGeNet DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDisgenet(), DISGENET_FILE_ID, DISGENET_DATA, geneDownloadPath); From 100d6f3c6120d08b1fbfccb582ff4fe94b03bd4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 17 May 2024 11:10:42 +0200 Subject: [PATCH 075/148] lib: update gene builder (Ensembl/RefSeq) according to last changes, #TASK-5576, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 3 +- .../lib/builders/CellBaseBuilder.java | 39 +++++++++++++++++-- .../lib/builders/EnsemblGeneBuilder.java | 37 ++++++++++-------- .../builders/EnsemblGeneBuilderIndexer.java | 18 +++++---- .../cellbase/lib/builders/GeneBuilder.java | 11 ++++-- .../lib/builders/RefSeqGeneBuilder.java | 36 ++++++++++------- .../builders/RefSeqGeneBuilderIndexer.java | 6 +-- .../lib/builders/EnsemblGeneBuilderTest.java | 2 +- .../lib/builders/RefSeqGeneBuilderTest.java | 5 ++- 9 files changed, 105 insertions(+), 52 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 0c342782cc..87506c53e0 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -242,7 
+242,8 @@ private CellBaseBuilder buildGenomeSequence() throws CellBaseException { } private CellBaseBuilder buildGene() throws CellBaseException { - return new GeneBuilder(downloadFolder.resolve(GENE_DATA), buildFolder.resolve(GENE_DATA), speciesConfiguration, flexibleGTFParsing); + return new GeneBuilder(downloadFolder.resolve(GENE_DATA), buildFolder.resolve(GENE_DATA), speciesConfiguration, flexibleGTFParsing, + configuration); } private CellBaseBuilder buildCadd() throws CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java index fe1b5fe648..eeb91729a5 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java @@ -55,13 +55,13 @@ public abstract class CellBaseBuilder { public static final String CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE = "Checking {} done!"; public static final String BUILDING_LOG_MESSAGE = "Building {} ..."; - public static final String BUILDING_DONE_LOG_MESSAGE = "Building done!"; + public static final String BUILDING_DONE_LOG_MESSAGE = "Building done."; public static final String CATEGORY_BUILDING_LOG_MESSAGE = "Building {}/{} ..."; - public static final String CATEGORY_BUILDING_DONE_LOG_MESSAGE = "Building done!"; + public static final String CATEGORY_BUILDING_DONE_LOG_MESSAGE = "Building done."; public static final String PARSING_LOG_MESSAGE = "Parsing {} ..."; - public static final String PARSING_DONE_LOG_MESSAGE = "Parsing done!"; + public static final String PARSING_DONE_LOG_MESSAGE = "Parsing done."; public CellBaseBuilder(CellBaseSerializer serializer) { logger = LoggerFactory.getLogger(this.getClass()); @@ -82,8 +82,39 @@ public void disconnect() { } } + protected File checkFile(DownloadProperties.URLProperties props, String fileId, Path targetPath, String name) throws 
CellBaseException { + logger.info("Checking file {} (file ID {} in config.) ...", name, fileId); + String filename = Paths.get(props.getFiles().get(fileId)).getFileName().toString(); + if (filename.contains(MANUAL_PREFIX)) { + filename = filename.replace(MANUAL_PREFIX, ""); + } + Path filePath = targetPath.resolve(filename); + if (!Files.exists(filePath)) { + if (filename.contains(PUT_CAPITAL_SPECIES_HERE_MARK)) { + // Check + filename = filename.replace(PUT_CAPITAL_SPECIES_HERE_MARK + "." + PUT_ASSEMBLY_HERE_MARK + "." + PUT_RELEASE_HERE_MARK, "") + .replace(PUT_CAPITAL_SPECIES_HERE_MARK + "." + PUT_ASSEMBLY_HERE_MARK, ""); + boolean found = false; + for (File file : targetPath.toFile().listFiles()) { + if (file.getName().endsWith(filename)) { + filePath = file.toPath(); + found = true; + } + } + if (!found) { + throw new CellBaseException("Expected " + name + " file (configuration file ID = " + fileId + ") does not exist at " + + targetPath); + } + } else { + throw new CellBaseException("Expected " + name + " file: " + filename + " does not exist at " + targetPath); + } + } + logger.info("Ok."); + return filePath.toFile(); + } + protected File checkFile(String data, DownloadProperties.URLProperties props, String fileId, Path targetPath) throws CellBaseException { - logger.info("Checking file {}/{} ...", getDataName(data), fileId); + logger.info("Checking file {} (file ID {} in config.) 
...", getDataName(data), fileId); if (!props.getFiles().containsKey(fileId)) { throw new CellBaseException("File ID " + fileId + " does not exist in the configuration file in the section '" + data + "'"); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java index d6b935fa52..4b3f43e255 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java @@ -26,6 +26,7 @@ import org.opencb.biodata.tools.sequence.FastaIndex; import org.opencb.cellbase.core.ParamConstants; import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.models.DataSource; @@ -92,12 +93,13 @@ public class EnsemblGeneBuilder extends CellBaseBuilder { private Gtf nextGtfToReturn; public EnsemblGeneBuilder(Path downloadPath, SpeciesConfiguration speciesConfiguration, boolean flexibleGTFParsing, - CellBaseSerializer serializer) { + CellBaseConfiguration configuration, CellBaseSerializer serializer) { super(serializer); this.downloadPath = downloadPath; this.speciesConfiguration = speciesConfiguration; this.flexibleGTFParsing = flexibleGTFParsing; + this.configuration = configuration; transcriptDict = new HashMap<>(250000); exonDict = new HashMap<>(8000000); @@ -122,14 +124,17 @@ public void check() throws Exception { } // Check Ensembl files - List files = checkFiles(ensemblGeneLabel, ENSEMBL_DATA, downloadPath, 3); - gtfFile = files.stream().filter(f -> f.getName().contains(".gtf")).findFirst().get().toPath(); - proteinFastaFile = files.stream().filter(f -> f.getName().contains(".pep.all.fa")).findFirst().get().toPath(); - cDnaFastaFile 
= files.stream().filter(f -> f.getName().contains(".cdna.all.fa")).findFirst().get().toPath(); - - // Check common files - // geneDescriptionFile = - // xrefsFile = + DownloadProperties.URLProperties props = configuration.getDownload().getEnsembl().getUrl(); + gtfFile = checkFile(props, ENSEMBL_GTF_FILE_ID, downloadPath, "Ensembl GTF").toPath(); + proteinFastaFile = checkFile(props, ENSEMBL_PEP_FA_FILE_ID, downloadPath, "Ensembl Protein Fasta").toPath(); + cDnaFastaFile = checkFile(props, ENSEMBL_CDNA_FA_FILE_ID, downloadPath, "Ensembl CDNA Fasta").toPath(); + + // Commons + geneDescriptionFile = checkFile(props, ENSEMBL_DESCRIPTION_FILE_ID, downloadPath.getParent(), "Ensembl Description").toPath(); + xrefsFile = checkFile(props, ENSEMBL_XREFS_FILE_ID, downloadPath.getParent(), "Ensembl Xrefs").toPath(); + ensemblCanonicalFile = checkFile(props, ENSEMBL_CANONICAL_FILE_ID, downloadPath.getParent(), "Ensembl Canonical").toPath(); + tso500File = checkFile(props, ENSEMBL_TSO500_FILE_ID, downloadPath.getParent(), "Ensembl TSO 500").toPath(); + eglhHaemOncFile = checkFile(props, ENSEMBL_HAEM_ONC_TRANSCRIPTS_FILE_ID, downloadPath.getParent(), "EGLH Haem Onc").toPath(); maneFile = checkFiles(MANE_SELECT_DATA, downloadPath.getParent(), 1).get(0).toPath(); lrgFile = checkFiles(LRG_DATA, downloadPath.getParent(), 1).get(0).toPath(); hgncFile = checkFiles(HGNC_DATA, downloadPath.getParent(), 1).get(0).toPath(); @@ -137,18 +142,16 @@ public void check() throws Exception { geneDrugFile = checkFiles(DGIDB_DATA, downloadPath.getParent(), 1).get(0).toPath(); uniprotIdMappingFile = checkFiles(UNIPROT_XREF_DATA, downloadPath.getParent(), 1).get(0).toPath(); geneExpressionFile = checkFiles(GENE_EXPRESSION_ATLAS_DATA, downloadPath.getParent(), 1).get(0).toPath(); - // hpoFile = checkFiles(HPO_DATA, downloadPath.getParent(), 1); + hpoFile = checkFiles(HPO_DISEASE_DATA, downloadPath.getParent(), 1).get(0).toPath(); disgenetFile = checkFiles(DISGENET_DATA, downloadPath.getParent(), 
1).get(0).toPath(); gnomadFile = checkFiles(GNOMAD_CONSTRAINTS_DATA, downloadPath.getParent(), 1).get(0).toPath(); geneOntologyAnnotationFile = checkFiles(GO_ANNOTATION_DATA, downloadPath.getParent(), 1).get(0).toPath(); - // ensemblCanonicalFile = ; - // cancerGeneCensus = - // tso500File = - // eglhHaemOncFile = + cancerGeneCensusFile = checkFiles(CANCER_GENE_CENSUS_DATA, downloadPath.getParent(), 1).get(0).toPath(); // Check regulation files // Motif features - files = checkFiles(ensemblGeneLabel, MOTIF_FEATURES_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA), 2); + List files = checkFiles(ensemblGeneLabel, MOTIF_FEATURES_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA), + 2); if (files.get(0).getName().endsWith("tbi")) { tabixFile = files.get(0).toPath(); tfbsFile = files.get(1).toPath(); @@ -177,7 +180,9 @@ public void check() throws Exception { } miRTarBaseFile = downloadRegulationPath.resolve(mirTarBaseFiles.get(0).replace(XLSX_EXTENSION, CSV_EXTENSION)); if (!Files.exists(miRTarBaseFile)) { - throw new CellBaseException("The " + getDataName(MIRTARBASE_DATA) + " fixed file " + miRTarBaseFile + " does not exist"); + throw new CellBaseException("The " + getDataName(MIRTARBASE_DATA) + " fixed file " + miRTarBaseFile + " does not exist. 
You" + + " have to export the file " + mirTarBaseFiles.get(0) + " to " + miRTarBaseFile.getFileName() + " format separated by" + + " tabs and then execute the script cellbase-app/app/scripts/mirtarbase/fix-gene-symbols.sh"); } // Check genome fasta file diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java index 10f54e2ea1..0b102d015c 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java @@ -74,8 +74,8 @@ public void index(Path geneDescriptionFile, Path xrefsFile, Path hgncFile, Path Path disgenetFile, Path gnomadFile, Path geneOntologyAnnotationFile, Path miRBaseFile, Path miRTarBaseFile, Path cancerGeneGensusFile, Path cancerHostpotFile, Path canonicalFile, Path tso500File, Path eglhHaemOncFile) throws IOException, RocksDBException, FileFormatException, CellBaseException { -// indexDescriptions(geneDescriptionFile); -// indexXrefs(xrefsFile, uniprotIdMappingFile); + indexDescriptions(geneDescriptionFile); + indexXrefs(xrefsFile, uniprotIdMappingFile); indexHgncIdMapping(hgncFile); indexManeMapping(maneFile, ENSEMBL_DATA); indexLrgMapping(lrgFile, ENSEMBL_DATA); @@ -88,15 +88,15 @@ public void index(Path geneDescriptionFile, Path xrefsFile, Path hgncFile, Path indexOntologyAnnotations(geneOntologyAnnotationFile); indexMiRBase(species, miRBaseFile); indexMiRTarBase(miRTarBaseFile); -// indexCancerGeneCensus(cancerGeneGensusFile); + indexCancerGeneCensus(cancerGeneGensusFile); indexCancerHotspot(cancerHostpotFile); -// indexCanonical(canonicalFile); -// indexTSO500(tso500File); -// indexEGLHHaemOnc(eglhHaemOncFile); + indexCanonical(canonicalFile); + indexTSO500(tso500File); + indexEGLHHaemOnc(eglhHaemOncFile); } private void indexDescriptions(Path geneDescriptionFile) throws 
IOException, RocksDBException { - logger.info("Loading gene description data..."); + logger.info(PARSING_LOG_MESSAGE, geneDescriptionFile); String[] fields; if (geneDescriptionFile != null && Files.exists(geneDescriptionFile) && Files.size(geneDescriptionFile) > 0) { List lines = Files.readAllLines(geneDescriptionFile, StandardCharsets.ISO_8859_1); @@ -108,6 +108,7 @@ private void indexDescriptions(Path geneDescriptionFile) throws IOException, Roc logger.warn("Gene description file " + geneDescriptionFile + " not found"); logger.warn("Gene description data not loaded"); } + logger.info(PARSING_DONE_LOG_MESSAGE); } public String getDescription(String id) throws RocksDBException { @@ -120,7 +121,7 @@ public String getDescription(String id) throws RocksDBException { } private void indexXrefs(Path xrefsFile, Path uniprotIdMappingFile) throws IOException, RocksDBException { - logger.info("Loading xref data..."); + logger.info(PARSING_LOG_MESSAGE, xrefsFile); String[] fields; if (xrefsFile != null && Files.exists(xrefsFile) && Files.size(xrefsFile) > 0) { List lines = Files.readAllLines(xrefsFile, StandardCharsets.ISO_8859_1); @@ -182,6 +183,7 @@ private void indexXrefs(Path xrefsFile, Path uniprotIdMappingFile) throws IOExce logger.warn("Uniprot if mapping file " + uniprotIdMappingFile + " not found"); logger.warn("Protein mapping into xref data not loaded"); } + logger.info(PARSING_DONE_LOG_MESSAGE); } public List getXrefs(String id) throws RocksDBException, IOException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java index 970f73e05a..43a654f2da 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java @@ -16,6 +16,7 @@ package org.opencb.cellbase.lib.builders; +import org.opencb.cellbase.core.config.CellBaseConfiguration; import 
org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; @@ -29,7 +30,8 @@ public class GeneBuilder extends CellBaseBuilder { private EnsemblGeneBuilder ensemblGeneBuilder; private RefSeqGeneBuilder refSeqGeneBuilder; - public GeneBuilder(Path downloadPath, Path buildPath, SpeciesConfiguration speciesConfiguration, boolean flexibleGTFParsing) + public GeneBuilder(Path downloadPath, Path buildPath, SpeciesConfiguration speciesConfiguration, boolean flexibleGTFParsing, + CellBaseConfiguration configuration) throws CellBaseException { super(null); @@ -37,12 +39,13 @@ public GeneBuilder(Path downloadPath, Path buildPath, SpeciesConfiguration speci CellBaseJsonFileSerializer ensemblGeneSerializer = new CellBaseJsonFileSerializer(buildPath.resolve(ENSEMBL_DATA), ENSEMBL_GENE_BASENAME); this.ensemblGeneBuilder = new EnsemblGeneBuilder(downloadPath.resolve(ENSEMBL_DATA), speciesConfiguration, flexibleGTFParsing, - ensemblGeneSerializer); + configuration, ensemblGeneSerializer); // Create RefSeq gene builder CellBaseJsonFileSerializer refSeqGeneSerializer = new CellBaseJsonFileSerializer(buildPath.resolve(REFSEQ_DATA), REFSEQ_GENE_BASENAME); - this.refSeqGeneBuilder = new RefSeqGeneBuilder(downloadPath.resolve(REFSEQ_DATA), speciesConfiguration, refSeqGeneSerializer); + this.refSeqGeneBuilder = new RefSeqGeneBuilder(downloadPath.resolve(REFSEQ_DATA), speciesConfiguration, configuration, + refSeqGeneSerializer); } public void check() throws Exception { @@ -60,7 +63,7 @@ public void parse() throws Exception { // Check folders and files before building check(); - // Build Ensembl/RefSeq genes +// // Build Ensembl/RefSeq genes ensemblGeneBuilder.parse(); refSeqGeneBuilder.parse(); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java index 8f03a801f2..b8b273b322 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java @@ -22,13 +22,14 @@ import org.opencb.biodata.models.core.*; import org.opencb.biodata.tools.sequence.FastaIndex; import org.opencb.cellbase.core.ParamConstants; +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.rocksdb.RocksDBException; -import java.io.File; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; @@ -41,6 +42,7 @@ public class RefSeqGeneBuilder extends CellBaseBuilder { private Path downloadPath; + private CellBaseConfiguration configuration; private Map transcriptDict; private Map exonDict; @@ -54,7 +56,7 @@ public class RefSeqGeneBuilder extends CellBaseBuilder { private Path hpoFile; private Path geneDrugFile; private Path miRTarBaseFile; - private Path cancerGeneCensus; + private Path cancerGeneCensusFile; private Path cancerHotspot; private Path tso500File; private Path eglhHaemOncFile; @@ -69,11 +71,13 @@ public class RefSeqGeneBuilder extends CellBaseBuilder { // sometimes there are two stop codons (eg NM_018159.4). 
Only parse the first one, skip the second private boolean seenStopCodon = false; - public RefSeqGeneBuilder(Path downloadPath, SpeciesConfiguration speciesConfiguration, CellBaseSerializer serializer) { + public RefSeqGeneBuilder(Path downloadPath, SpeciesConfiguration speciesConfiguration, CellBaseConfiguration configuration, + CellBaseSerializer serializer) { super(serializer); this.downloadPath = downloadPath; this.speciesConfiguration = speciesConfiguration; + this.configuration = configuration; transcriptDict = new HashMap<>(250000); exonDict = new HashMap<>(8000000); @@ -98,22 +102,24 @@ public void check() throws Exception { } // Check RefSeq files - List files = checkFiles(refSeqGeneLabel, REFSEQ_DATA, downloadPath, 4); - gtfFile = files.stream().filter(f -> f.getName().contains(".gtf")).findFirst().get().toPath(); - proteinFastaFile = files.stream().filter(f -> f.getName().contains("_protein")).findFirst().get().toPath(); - cdnaFastaFile = files.stream().filter(f -> f.getName().contains("_rna")).findFirst().get().toPath(); - fastaFile = files.stream().filter(f -> f.getName().contains("_genomic.fna")).findFirst().get().toPath(); + DownloadProperties.URLProperties props = configuration.getDownload().getRefSeq(); + gtfFile = checkFile(props, REFSEQ_GENOMIC_GTF_FILE_ID, downloadPath, "RefSeq GTF").toPath(); + proteinFastaFile = checkFile(props, REFSEQ_PROTEIN_FAA_FILE_ID, downloadPath, "RefSeq Protein FAA").toPath(); + cdnaFastaFile = checkFile(props, REFSEQ_RNA_FNA_FILE_ID, downloadPath, "RefSeq RNA FNA").toPath(); + fastaFile = checkFile(props, REFSEQ_GENOMIC_FNA_FILE_ID, downloadPath, "RefSeq Genomic FNA").toPath(); // Check common files + props = configuration.getDownload().getEnsembl().getUrl(); + tso500File = checkFile(props, ENSEMBL_TSO500_FILE_ID, downloadPath.getParent(), "Ensembl TSO 500").toPath(); + eglhHaemOncFile = checkFile(props, ENSEMBL_HAEM_ONC_TRANSCRIPTS_FILE_ID, downloadPath.getParent(), "EGLH Haem Onc").toPath(); + maneFile = 
checkFiles(MANE_SELECT_DATA, downloadPath.getParent(), 1).get(0).toPath(); lrgFile = checkFiles(LRG_DATA, downloadPath.getParent(), 1).get(0).toPath(); cancerHotspot = checkFiles(CANCER_HOTSPOT_DATA, downloadPath.getParent(), 1).get(0).toPath(); geneDrugFile = checkFiles(DGIDB_DATA, downloadPath.getParent(), 1).get(0).toPath(); - // hpoFile = checkFiles(HPO_DATA, downloadPath.getParent(), 1); + hpoFile = checkFiles(HPO_DISEASE_DATA, downloadPath.getParent(), 1).get(0).toPath(); disgenetFile = checkFiles(DISGENET_DATA, downloadPath.getParent(), 1).get(0).toPath(); - // cancerGeneCensus = ; - // tso500File = ; - // eglhHaemOncFile = ; + cancerGeneCensusFile = checkFiles(CANCER_GENE_CENSUS_DATA, downloadPath.getParent(), 1).get(0).toPath(); // Check regulation files // mirtarbase @@ -134,7 +140,9 @@ public void check() throws Exception { } miRTarBaseFile = downloadRegulationPath.resolve(mirTarBaseFiles.get(0).replace(XLSX_EXTENSION, CSV_EXTENSION)); if (!Files.exists(miRTarBaseFile)) { - throw new CellBaseException("The " + getDataName(MIRTARBASE_DATA) + " fixed file " + miRTarBaseFile + " does not exist"); + throw new CellBaseException("The " + getDataName(MIRTARBASE_DATA) + " fixed file " + miRTarBaseFile + " does not exist. 
You" + + " have to export the file " + mirTarBaseFiles.get(0) + " to " + miRTarBaseFile.getFileName() + " format separated by" + + " tabs and then execute the script cellbase-app/app/scripts/mirtarbase/fix-gene-symbols.sh"); } logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE, refSeqGeneLabel); @@ -154,7 +162,7 @@ public void parse() throws Exception { logger.info("Indexing gene annotation for {} ...", getDataName(REFSEQ_DATA)); RefSeqGeneBuilderIndexer indexer = new RefSeqGeneBuilderIndexer(gtfFile.getParent()); indexer.index(maneFile, lrgFile, proteinFastaFile, cdnaFastaFile, geneDrugFile, hpoFile, disgenetFile, miRTarBaseFile, - cancerGeneCensus, cancerHotspot, tso500File, eglhHaemOncFile); + cancerGeneCensusFile, cancerHotspot, tso500File, eglhHaemOncFile); logger.info("Indexing done for {}", getDataName(REFSEQ_DATA)); logger.info(PARSING_LOG_MESSAGE, gtfFile); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java index 9aae170ce2..8542e76b1c 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java @@ -41,9 +41,9 @@ public void index(Path maneFile, Path lrgFile, Path proteinFastaFile, Path cDnaF indexDrugs(geneDrugFile); indexDiseases(hpoFilePath, disgenetFile); indexMiRTarBase(miRTarBaseFile); -// indexCancerGeneCensus(cancerGeneGensus); + indexCancerGeneCensus(cancerGeneGensus); indexCancerHotspot(cancerHotspot); -// indexTSO500(tso500File); -// indexEGLHHaemOnc(eglhHaemOncFile); + indexTSO500(tso500File); + indexEGLHHaemOnc(eglhHaemOncFile); } } diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderTest.java index 63d1f445a8..b15925e7a3 100644 --- 
a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderTest.java @@ -15,7 +15,7 @@ public void testGeneBuilder() throws Exception { CellBaseConfiguration configuration = CellBaseConfiguration.load(Paths.get("/home/jtarraga/appl/cellbase/build/conf/configuration.yml")); SpeciesConfiguration speciesConfiguration = configuration.getSpeciesConfig("hsapiens"); - GeneBuilder geneBuilder = new GeneBuilder(downloadPath, buildPath, speciesConfiguration, flexibleGTFParsing); + GeneBuilder geneBuilder = new GeneBuilder(downloadPath, buildPath, speciesConfiguration, flexibleGTFParsing, configuration); geneBuilder.check(); geneBuilder.parse(); } diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderTest.java index 806c096873..9ab36de70a 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderTest.java @@ -26,6 +26,7 @@ import org.junit.jupiter.api.TestInstance; import org.eclipse.jetty.util.ajax.JSON; import org.opencb.biodata.models.core.*; +import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; import org.opencb.cellbase.core.serializer.CellBaseSerializer; @@ -54,12 +55,14 @@ public RefSeqGeneBuilderTest() { public void init() throws Exception { try { Path geneDirectoryPath = Paths.get(RefSeqGeneBuilderTest.class.getResource("/gene_refseq").toURI()); + Path configurationPath = Paths.get(RefSeqGeneBuilderTest.class.getResource("configuration.test.yml").toURI()); // put the results in /tmp CellBaseSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), 
"refseq", true); SpeciesConfiguration species = new SpeciesConfiguration("hsapiens", "Homo sapiens", "human", null, null, null); - geneParser = new RefSeqGeneBuilder(geneDirectoryPath, species, serializer); + CellBaseConfiguration configuration = CellBaseConfiguration.load(configurationPath); + geneParser = new RefSeqGeneBuilder(geneDirectoryPath, species, configuration, serializer); geneParser.parse(); jsonObjectMapper = new ObjectMapper(); jsonObjectMapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); From 0cd4b805b879e3a73d81e53f57e4ad3205e7a158 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 27 May 2024 13:10:12 +0200 Subject: [PATCH 076/148] lib: udate Ensembl/RefSeq gene builder to gunzip FASTA files before building, #TASK-5576, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 2 +- .../org/opencb/cellbase/lib/EtlCommons.java | 1 + .../lib/builders/EnsemblGeneBuilder.java | 27 +++++++++++++++++-- .../lib/builders/RefSeqGeneBuilder.java | 24 ++++++++++++++++- 4 files changed, 50 insertions(+), 4 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 87506c53e0..c3853c4478 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -351,7 +351,7 @@ private Path getFastaReferenceGenome() throws CellBaseException { throw new CellBaseException("Error executing gunzip in FASTA file " + fastaPath, e); } } - fastaPath = downloadFolder.resolve(GENOME_DATA).resolve(fastaFilename.replace(".gz", "")); + fastaPath = downloadFolder.resolve(GENOME_DATA).resolve(fastaFilename.replace(GZ_EXTENSION, "")); if (!fastaPath.toFile().exists()) { throw new CellBaseException("FASTA file 
" + fastaPath + " does not exist after executing gunzip"); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 1e61c44ffa..8b8bbe7075 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -61,6 +61,7 @@ public final class EtlCommons { public static final String CSV_EXTENSION = ".csv"; public static final String TBI_EXTENSION = ".tbi"; public static final String FAI_EXTENSION = ".fai"; + public static final String GZ_EXTENSION = ".gz"; public static final String OK_LOG_MESSAGE = "Ok."; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java index 4b3f43e255..27098bcf14 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java @@ -31,6 +31,7 @@ import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseSerializer; +import org.opencb.cellbase.lib.EtlCommons; import org.rocksdb.RocksDBException; import java.io.File; @@ -185,8 +186,30 @@ public void check() throws Exception { + " tabs and then execute the script cellbase-app/app/scripts/mirtarbase/fix-gene-symbols.sh"); } - // Check genome fasta file - genomeSequenceFilePath = checkFiles(GENOME_DATA, downloadPath.getParent().getParent().resolve(GENOME_DATA), 1).get(0).toPath(); + // Check genome FASTA file + Path genomeDownloadPath = downloadPath.getParent().getParent().resolve(GENOME_DATA); + String genomeGzFilename = Paths.get(((DataSource) dataSourceReader.readValue(genomeDownloadPath + 
.resolve(getDataVersionFilename(GENOME_DATA)).toFile())).getUrls().get(0)).getFileName().toString(); + genomeSequenceFilePath = genomeDownloadPath.resolve(genomeGzFilename); + if (Files.exists(genomeSequenceFilePath)) { + // Need to be gunzip-ed + logger.info("Gunzip file: {}", genomeSequenceFilePath); + try { + EtlCommons.runCommandLineProcess(null, "gunzip", Collections.singletonList(genomeSequenceFilePath.toString()), null); + } catch (IOException e) { + throw new CellBaseException("Error executing gunzip in FASTA file " + genomeSequenceFilePath, e); + } catch (InterruptedException e) { + // Restore interrupted state... + Thread.currentThread().interrupt(); + throw new CellBaseException("Error executing gunzip in FASTA file " + genomeSequenceFilePath, e); + } + } + String genomeFilename = genomeGzFilename.replace(GZ_EXTENSION, ""); + genomeSequenceFilePath = genomeDownloadPath.resolve(genomeFilename); + if (!Files.exists(genomeSequenceFilePath)) { + throw new CellBaseException("Genome FASTA file " + genomeSequenceFilePath.getFileName() + " does not exist at " + + genomeSequenceFilePath.getParent()); + } logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE, ensemblGeneLabel); checked = true; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java index b8b273b322..b291d2f9cd 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java @@ -28,6 +28,7 @@ import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseSerializer; +import org.opencb.cellbase.lib.EtlCommons; import org.rocksdb.RocksDBException; import java.io.IOException; @@ -106,7 +107,28 @@ public void check() throws Exception { gtfFile = checkFile(props, 
REFSEQ_GENOMIC_GTF_FILE_ID, downloadPath, "RefSeq GTF").toPath(); proteinFastaFile = checkFile(props, REFSEQ_PROTEIN_FAA_FILE_ID, downloadPath, "RefSeq Protein FAA").toPath(); cdnaFastaFile = checkFile(props, REFSEQ_RNA_FNA_FILE_ID, downloadPath, "RefSeq RNA FNA").toPath(); - fastaFile = checkFile(props, REFSEQ_GENOMIC_FNA_FILE_ID, downloadPath, "RefSeq Genomic FNA").toPath(); + + // Check genome FASTA file + String genomeGzFilename = Paths.get(props.getFiles().get(REFSEQ_GENOMIC_FNA_FILE_ID)).getFileName().toString(); + fastaFile = downloadPath.resolve(genomeGzFilename); + if (Files.exists(fastaFile)) { + // Need to be gunzip-ed + logger.info("Gunzip file: {}", fastaFile); + try { + EtlCommons.runCommandLineProcess(null, "gunzip", Collections.singletonList(fastaFile.toString()), null); + } catch (IOException e) { + throw new CellBaseException("Error executing gunzip in FASTA file " + fastaFile, e); + } catch (InterruptedException e) { + // Restore interrupted state... + Thread.currentThread().interrupt(); + throw new CellBaseException("Error executing gunzip in FASTA file " + fastaFile, e); + } + } + String genomeFilename = genomeGzFilename.replace(GZ_EXTENSION, ""); + fastaFile = downloadPath.resolve(genomeFilename); + if (!Files.exists(fastaFile)) { + throw new CellBaseException("Genome FASTA file " + fastaFile.getFileName() + " does not exist at " + fastaFile.getParent()); + } // Check common files props = configuration.getDownload().getEnsembl().getUrl(); From fc65d14c14cfa166d7609c27fa4854aa351787fd Mon Sep 17 00:00:00 2001 From: imedina Date: Sun, 23 Jun 2024 01:41:30 +0100 Subject: [PATCH 077/148] lib: add hpo filter to GeneQuery --- .../opencb/cellbase/core/api/GeneQuery.java | 30 +++++++++---------- .../lib/impl/core/GeneMongoDBAdaptor.java | 1 + .../src/main/resources/mongodb-indexes.json | 1 + 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/api/GeneQuery.java 
b/cellbase-core/src/main/java/org/opencb/cellbase/core/api/GeneQuery.java index 1451fc213b..b987afa29f 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/api/GeneQuery.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/api/GeneQuery.java @@ -42,12 +42,11 @@ public class GeneQuery extends AbstractQuery { @QueryParameter(id = "region") private List regions; - @QueryParameter(id = "transcripts.biotype", alias = {ParamConstants.TRANSCRIPT_BIOTYPES_PARAM, - "transcriptsBiotype"}) + @QueryParameter(id = "transcripts.biotype", alias = {ParamConstants.TRANSCRIPT_BIOTYPES_PARAM, "transcriptsBiotype"}) private List transcriptsBiotype; - @QueryParameter(id = "transcripts.xrefs.id", alias = {ParamConstants.TRANSCRIPT_XREFS_PARAM, "xrefs", "transcriptsXrefsId", - "transcripts.xrefs"}) + @QueryParameter(id = "transcripts.xrefs.id", + alias = {ParamConstants.TRANSCRIPT_XREFS_PARAM, "xrefs", "transcriptsXrefsId", "transcripts.xrefs"}) private List transcriptsXrefs; @QueryParameter(id = "transcripts.id", alias = {ParamConstants.TRANSCRIPT_IDS_PARAM, "transcriptsId"}) private List transcriptsId; @@ -61,22 +60,23 @@ public class GeneQuery extends AbstractQuery { private LogicalList transcriptsTfbsId; @QueryParameter(id = "transcripts.tfbs.pfmId", alias = {ParamConstants.TRANSCRIPT_TFBS_PFMIDS_PARAM, "transcriptsTfbsPfmId"}) private LogicalList transcriptsTfbsPfmId; - @QueryParameter(id = "transcripts.tfbs.transcriptionFactors", alias = {ParamConstants.TRANSCRIPT_TRANSCRIPTION_FACTORS_PARAM, - "transcriptsTfbsTranscriptionFactors"}) + @QueryParameter(id = "transcripts.tfbs.transcriptionFactors", + alias = {ParamConstants.TRANSCRIPT_TRANSCRIPTION_FACTORS_PARAM, "transcriptsTfbsTranscriptionFactors"}) private LogicalList transcriptsTfbsTranscriptionFactors; - @QueryParameter(id = ParamConstants.ONTOLOGY_PARAM, alias = {"transcripts.annotation.ontologies.id", - "transcripts.annotation.ontologies.name", "transcriptAnnotationOntologiesId"}) + @QueryParameter(id 
= ParamConstants.ONTOLOGY_PARAM, + alias = {"transcripts.annotation.ontologies.id", "transcripts.annotation.ontologies.name", "transcriptAnnotationOntologiesId"}) private LogicalList transcriptAnnotationOntologiesId; - @QueryParameter(id = ParamConstants.ANNOTATION_DISEASES_PARAM, alias = {"annotation.diseases.id", "annotation.diseases.name"}) + @QueryParameter(id = ParamConstants.ANNOTATION_DISEASES_PARAM, + alias = {"annotation.diseases.id", "annotation.diseases.name", "annotation.diseases.hpo"}) private LogicalList annotationDiseases; - @QueryParameter(id = "annotation.expression.tissue", alias = {ParamConstants.ANNOTATION_EXPRESSION_TISSUE_PARAM, - "annotationExpressionTissue"}) + @QueryParameter(id = "annotation.expression.tissue", + alias = {ParamConstants.ANNOTATION_EXPRESSION_TISSUE_PARAM, "annotationExpressionTissue"}) private LogicalList annotationExpressionTissue; - @QueryParameter(id = "annotation.expression.value", alias = {ParamConstants.ANNOTATION_EXPRESSION_VALUE_PARAM, - "annotationExpressionValue"}) + @QueryParameter(id = "annotation.expression.value", + alias = {ParamConstants.ANNOTATION_EXPRESSION_VALUE_PARAM, "annotationExpressionValue"}) private LogicalList annotationExpressionValue; - @QueryParameter(id = "annotation.drugs.drugName", alias = {ParamConstants.ANNOTATION_DRUGS_NAME_PARAM, "annotation.drugs.name", - "annotationDrugsName"}) + @QueryParameter(id = "annotation.drugs.drugName", + alias = {ParamConstants.ANNOTATION_DRUGS_NAME_PARAM, "annotation.drugs.name", "annotationDrugsName"}) private LogicalList annotationDrugsName; @QueryParameter(id = "constraints", alias = {ParamConstants.ANNOTATION_CONSTRAINTS_PARAM}) private LogicalList annotationConstraints; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/GeneMongoDBAdaptor.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/GeneMongoDBAdaptor.java index 94a686310b..9ee645fbe2 100644 --- 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/GeneMongoDBAdaptor.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/GeneMongoDBAdaptor.java @@ -257,6 +257,7 @@ private void createDiseaseQuery(Object queryValues, List andBsonList) { List orBsonList = new ArrayList<>(); orBsonList.add(getLogicalListFilter(queryValues, "annotation.diseases.id")); orBsonList.add(getLogicalListFilter(queryValues, "annotation.diseases.name")); + orBsonList.add(getLogicalListFilter(queryValues, "annotation.diseases.hpo")); andBsonList.add(Filters.or(orBsonList)); } } diff --git a/cellbase-lib/src/main/resources/mongodb-indexes.json b/cellbase-lib/src/main/resources/mongodb-indexes.json index de81c7b83b..965effb194 100644 --- a/cellbase-lib/src/main/resources/mongodb-indexes.json +++ b/cellbase-lib/src/main/resources/mongodb-indexes.json @@ -43,6 +43,7 @@ {"collection": "gene", "fields": {"mirna.matures.id": 1}, "options": {"background": true, "sparse": true}} {"collection": "gene", "fields": {"annotation.diseases.id": 1}, "options": {"background": true}} {"collection": "gene", "fields": {"annotation.diseases.name": 1}, "options": {"background": true}} +{"collection": "gene", "fields": {"annotation.diseases.hpo": 1}, "options": {"background": true}} {"collection": "gene", "fields": {"annotation.expression.expression": 1}, "options": {"background": true}} {"collection": "gene", "fields": {"annotation.expression.factorValue": 1}, "options": {"background": true}} {"collection": "gene", "fields": {"annotation.drugs.drugName": 1}, "options": {"background": true}} From 84ad97bfa68b7d6a66730d9c3dd4c932095a68a6 Mon Sep 17 00:00:00 2001 From: imedina Date: Tue, 2 Jul 2024 03:38:27 +0100 Subject: [PATCH 078/148] Many improvements and fixes: * Support multiple species * remove sharding code * remove 'instal' command * launch genome_info.pl in the genome_info * update mouse version * many other... 
--- .../cloud/docker/cellbase-builder/Dockerfile | 7 +- .../app/scripts/ensembl-scripts/DB_CONFIG.pm | 6 +- .../ensembl-scripts/gene_extra_info.pl | 6 +- .../scripts/ensembl-scripts/genome_info.pl | 2 + .../app/cli/admin/AdminCliOptionsParser.java | 21 +- .../cellbase/app/cli/admin/AdminMain.java | 17 +- .../executors/DownloadCommandExecutor.java | 81 +++---- .../executors/InstallCommandExecutor.java | 44 ---- .../core/config/CellBaseConfiguration.java | 54 +---- .../core/config/DatabaseCredentials.java | 26 +-- .../cellbase/core/config/Databases.java | 24 +-- .../config/MongoDBDatabaseCredentials.java | 98 --------- .../core/config/SpeciesConfiguration.java | 137 +++--------- .../cellbase/core/utils/SpeciesUtils.java | 47 +++- .../src/main/resources/configuration.yml | 185 ++++++++-------- .../src/test/resources/configuration.yml | 38 ---- .../org/opencb/cellbase/lib/EtlCommons.java | 6 + .../lib/builders/CellBaseBuilder.java | 13 +- .../lib/builders/EnsemblGeneBuilder.java | 31 +-- .../lib/builders/RefSeqGeneBuilder.java | 30 +-- .../cellbase/lib/db/MongoDBManager.java | 10 +- .../lib/download/AbstractDownloadManager.java | 15 +- .../lib/download/CaddDownloadManager.java | 25 +-- .../lib/download/ClinicalDownloadManager.java | 95 ++++----- .../download/ConservationDownloadManager.java | 156 ++++++++++++++ .../cellbase/lib/download/DownloadFile.java | 30 +-- .../cellbase/lib/download/Downloader.java | 26 ++- .../lib/download/GeneDownloadManager.java | 180 +++++++++------- .../lib/download/GenomeDownloadManager.java | 200 ------------------ .../MissenseScoresDownloadManager.java | 39 ++-- .../lib/download/OntologyDownloadManager.java | 80 +++---- .../lib/download/PharmGKBDownloadManager.java | 46 ++-- .../lib/download/ProteinDownloadManager.java | 44 ++-- .../download/RegulationDownloadManager.java | 40 ++-- .../download/SpliceScoreDownloadManager.java | 4 +- .../cellbase/lib/install/InstallManager.java | 76 ------- .../lib/install/MongoDBShardUtils.java | 148 
------------- .../lib/builders/EnsemblGeneBuilderTest.java | 3 +- .../lib/builders/GeneBuilderTest.java | 12 +- .../lib/builders/RefSeqGeneBuilderTest.java | 3 +- 40 files changed, 813 insertions(+), 1292 deletions(-) delete mode 100644 cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/InstallCommandExecutor.java delete mode 100644 cellbase-core/src/main/java/org/opencb/cellbase/core/config/MongoDBDatabaseCredentials.java create mode 100644 cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ConservationDownloadManager.java delete mode 100644 cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java delete mode 100644 cellbase-lib/src/main/java/org/opencb/cellbase/lib/install/InstallManager.java delete mode 100644 cellbase-lib/src/main/java/org/opencb/cellbase/lib/install/MongoDBShardUtils.java diff --git a/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile b/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile index 6e1657d1bf..5235637267 100644 --- a/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile +++ b/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile @@ -11,7 +11,7 @@ LABEL org.label-schema.vendor="OpenCB" \ ## We need to be root to install dependencies USER root RUN apt-get update -y && \ - apt-get install -y git default-mysql-client libjson-perl libdbi-perl libdbd-mysql-perl libdbd-mysql-perl libtry-tiny-perl && \ + apt-get install -y git default-mysql-client libjson-perl libdbi-perl libdbd-mysql-perl libdbd-mysql-perl libtry-tiny-perl libxml-simple-perl liblog-log4perl-perl libxml-parser-perl libxml-dom-perl && \ mkdir /opt/ensembl && chown cellbase:cellbase /opt/ensembl && \ rm -rf /var/lib/apt/lists/* @@ -26,6 +26,7 @@ RUN cd /opt/ensembl && \ git clone https://github.com/Ensembl/ensembl-variation.git && \ git clone https://github.com/Ensembl/ensembl-funcgen.git && \ git clone https://github.com/Ensembl/ensembl-compara.git && \ - git clone 
https://github.com/Ensembl/ensembl-io.git + git clone https://github.com/Ensembl/ensembl-io.git && \ + git clone --branch cvs/release-0_7 https://github.com/biomart/biomart-perl -ENV PERL5LIB=$PERL5LIB:/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase/scripts/ensembl-scripts +ENV PERL5LIB=$PERL5LIB:/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase/scripts/ensembl-scripts:/opt/ensembl/biomart-perl/lib diff --git a/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm b/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm index b0edf65793..90f2f8208e 100755 --- a/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm +++ b/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm @@ -141,9 +141,9 @@ our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_111_38"; #our $HOMO_SAPIENS_CORE = "homo_sapiens_core_78_38"; #our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_78_38"; #our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_78_38"; -our $MUS_MUSCULUS_CORE = "mus_musculus_core_78_38"; -our $MUS_MUSCULUS_VARIATION = "mus_musculus_variation_78_38"; -our $MUS_MUSCULUS_FUNCTIONAL = "mus_musculus_funcgen_78_38"; +our $MUS_MUSCULUS_CORE = "mus_musculus_core_111_39"; +our $MUS_MUSCULUS_VARIATION = "mus_musculus_variation_111_39"; +our $MUS_MUSCULUS_FUNCTIONAL = "mus_musculus_funcgen_111_39"; our $RATTUS_NORVEGICUS_CORE = "rattus_norvegicus_core_78_5"; our $RATTUS_NORVEGICUS_VARIATION = "rattus_norvegicus_variation_78_5"; our $RATTUS_NORVEGICUS_FUNCTIONAL = "rattus_norvegicus_funcgen_78_5"; diff --git a/cellbase-app/app/scripts/ensembl-scripts/gene_extra_info.pl b/cellbase-app/app/scripts/ensembl-scripts/gene_extra_info.pl index 5e3aa9c46a..d227b4c1c5 100755 --- 
a/cellbase-app/app/scripts/ensembl-scripts/gene_extra_info.pl +++ b/cellbase-app/app/scripts/ensembl-scripts/gene_extra_info.pl @@ -16,6 +16,8 @@ #################################################################### ## Parsing command line options #################################### #################################################################### +##docker run -it --mount type=bind,source=/tmp,target=/tmp opencb/cellbase-builder:6.2.0-SNAPSHOT /opt/cellbase/scripts/ensembl-scripts/gene_extra_info.pl -s "Mus musculus" -o /tmp + # USAGE: ./gene_extra_info.pl --species "Homo sapiens" --outdir ../../appl_db/ird_v1/hsa ... ## Parsing command line @@ -50,8 +52,8 @@ if ($phylo eq "" || $phylo eq "vertebrate") { print ("In vertebrates section\n"); - if ($species eq "Homo sapiens" && $assembly eq "GRCh38") { - print ("Human selected, assembly ".$assembly." selected, connecting to port ".$ENSEMBL_PORT."\n"); + if ($species eq "Homo sapiens" || $species eq "Mus musculus") { + print ($species." selected, assembly ".$assembly." selected, connecting to port ".$ENSEMBL_PORT."\n"); Bio::EnsEMBL::Registry->load_registry_from_db( -host => $ENSEMBL_HOST, -user => $ENSEMBL_USER, diff --git a/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl b/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl index 50520f1f92..e5ecd61c33 100755 --- a/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl +++ b/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl @@ -17,6 +17,8 @@ #################################################################### ## Parsing command line options #################################### #################################################################### +##docker run -it --mount type=bind,source=/tmp,target=/tmp opencb/cellbase-builder:6.2.0-SNAPSHOT /opt/cellbase/scripts/ensembl-scripts/genome_info.pl --species "Mus musculus" --outfile /tmp + # USAGE: ./genome_info.pl --species "Homo sapiens" --outfile ../../appl_db/ird_v1/hsa ... 
## Parsing command line diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java index 17341eb8c6..ebf647f91b 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java @@ -17,20 +17,16 @@ package org.opencb.cellbase.app.cli.admin; import com.beust.jcommander.*; -import org.apache.commons.lang3.StringUtils; import org.opencb.cellbase.app.cli.CliOptionsParser; import org.opencb.cellbase.core.api.key.ApiKeyQuota; -import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import static org.opencb.cellbase.lib.EtlCommons.*; -/** - * Created by imedina on 03/02/15. - */ + public class AdminCliOptionsParser extends CliOptionsParser { private final CommonCommandOptions commonCommandOptions; @@ -44,7 +40,6 @@ public class AdminCliOptionsParser extends CliOptionsParser { private ExportCommandOptions exportCommandOptions; private CustomiseCommandOptions customiseCommandOptions; private IndexCommandOptions indexCommandOptions; - private InstallCommandOptions installCommandOptions; private ServerCommandOptions serverCommandOptions; private ValidationCommandOptions validationCommandOptions; @@ -61,7 +56,6 @@ public AdminCliOptionsParser() { exportCommandOptions = new ExportCommandOptions(); customiseCommandOptions = new CustomiseCommandOptions(); indexCommandOptions = new IndexCommandOptions(); - installCommandOptions = new InstallCommandOptions(); serverCommandOptions = new ServerCommandOptions(); validationCommandOptions = new ValidationCommandOptions(); @@ -73,7 +67,6 @@ public AdminCliOptionsParser() { jCommander.addCommand("export", exportCommandOptions); jCommander.addCommand("customise", customiseCommandOptions); jCommander.addCommand("index", 
indexCommandOptions); - jCommander.addCommand("install", installCommandOptions); jCommander.addCommand("server", serverCommandOptions); jCommander.addCommand("validate", validationCommandOptions); } @@ -322,16 +315,6 @@ public class IndexCommandOptions { public boolean validate; } - @Parameters(commandNames = {"install"}, commandDescription = "Set up sharding for CellBase") - public class InstallCommandOptions { - - @ParametersDelegate - public CommonCommandOptions commonOptions = commonCommandOptions; - - @ParametersDelegate - public SpeciesAndAssemblyCommandOptions speciesAndAssemblyOptions = speciesAndAssemblyCommandOptions; - } - @Parameters(commandNames = {"server"}, commandDescription = "Manage REST server") public class ServerCommandOptions { @@ -425,8 +408,6 @@ public IndexCommandOptions getIndexCommandOptions() { return indexCommandOptions; } - public InstallCommandOptions getInstallCommandOptions() { return installCommandOptions; } - public ServerCommandOptions getServerCommandOptions() { return serverCommandOptions; } public ValidationCommandOptions getValidationCommandOptions() { return validationCommandOptions; } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java index fecf57c08a..06030ec485 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java @@ -25,9 +25,7 @@ import java.io.IOException; import java.net.URISyntaxException; -/** - * Created by imedina on 03/02/15. 
- */ + public class AdminMain { public static void main(String[] args) { @@ -63,30 +61,27 @@ public static void main(String[] args) { case "build": commandExecutor = new BuildCommandExecutor(cliOptionsParser.getBuildCommandOptions()); break; + case "load": + commandExecutor = new LoadCommandExecutor(cliOptionsParser.getLoadCommandOptions()); + break; case "data-release": commandExecutor = new DataReleaseCommandExecutor(cliOptionsParser.getDataReleaseCommandOptions()); break; case "api-key": commandExecutor = new ApiKeyCommandExecutor(cliOptionsParser.getApiKeyCommandOptions()); break; - case "load": - commandExecutor = new LoadCommandExecutor(cliOptionsParser.getLoadCommandOptions()); - break; case "export": commandExecutor = new ExportCommandExecutor(cliOptionsParser.getExportCommandOptions()); break; case "index": commandExecutor = new IndexCommandExecutor(cliOptionsParser.getIndexCommandOptions()); break; - case "install": - commandExecutor = new InstallCommandExecutor(cliOptionsParser.getInstallCommandOptions()); + case "validate": + commandExecutor = new ValidationCommandExecutor(cliOptionsParser.getValidationCommandOptions()); break; case "server": commandExecutor = new ServerCommandExecutor(cliOptionsParser.getServerCommandOptions()); break; - case "validate": - commandExecutor = new ValidationCommandExecutor(cliOptionsParser.getValidationCommandOptions()); - break; default: break; } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java index f309b22041..8718bb29a9 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java @@ -19,7 +19,9 @@ import org.apache.commons.lang3.StringUtils; import org.opencb.cellbase.app.cli.CommandExecutor; import 
org.opencb.cellbase.app.cli.admin.AdminCliOptionsParser; +import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.utils.SpeciesUtils; import org.opencb.cellbase.lib.download.AbstractDownloadManager; import org.opencb.cellbase.lib.download.DownloadFile; import org.opencb.cellbase.lib.download.Downloader; @@ -32,17 +34,11 @@ import static org.opencb.cellbase.lib.EtlCommons.*; -/** - * Created by imedina on 03/02/15. - */ -public class DownloadCommandExecutor extends CommandExecutor { - private AdminCliOptionsParser.DownloadCommandOptions downloadCommandOptions; - private Path outputDirectory; +public class DownloadCommandExecutor extends CommandExecutor { - public static final List VALID_SOURCES_TO_DOWNLOAD = Arrays.asList(GENOME_DATA, GENE_DATA, VARIATION_FUNCTIONAL_SCORE_DATA, - MISSENSE_VARIATION_SCORE_DATA, REGULATION_DATA, PROTEIN_DATA, CONSERVATION_DATA, CLINICAL_VARIANT_DATA, REPEATS_DATA, - ONTOLOGY_DATA, SPLICE_SCORE_DATA, PUBMED_DATA, PHARMACOGENOMICS_DATA); + private final AdminCliOptionsParser.DownloadCommandOptions downloadCommandOptions; + private final Path outputDirectory; public DownloadCommandExecutor(AdminCliOptionsParser.DownloadCommandOptions downloadCommandOptions) { super(downloadCommandOptions.commonOptions.logLevel, downloadCommandOptions.commonOptions.conf); @@ -58,46 +54,52 @@ public DownloadCommandExecutor(AdminCliOptionsParser.DownloadCommandOptions down */ public void execute() throws CellBaseException { try { + // Get the species and the assembly String species = downloadCommandOptions.speciesAndAssemblyOptions.species; String assembly = downloadCommandOptions.speciesAndAssemblyOptions.assembly; + + // Get the valid list of data sources + SpeciesConfiguration speciesConfig = SpeciesUtils.getSpeciesConfiguration(configuration, species); + List dataList = getDataList(species, speciesConfig); + logger.info("Downloading the following data 
sources: {}", StringUtils.join(dataList, ",")); + List downloadFiles = new ArrayList<>(); - List dataList = checkDataSources(); Downloader downloader = new Downloader(species, assembly, outputDirectory, configuration); for (String data : dataList) { switch (data) { case GENOME_DATA: downloadFiles.addAll(downloader.downloadGenome()); break; + case CONSERVATION_DATA: + downloadFiles.addAll(downloader.downloadConservation()); + break; + case REPEATS_DATA: + downloadFiles.addAll(downloader.downloadRepeats()); + break; case GENE_DATA: downloadFiles.addAll(downloader.downloadGene()); break; - case VARIATION_FUNCTIONAL_SCORE_DATA: - downloadFiles.addAll(downloader.downloadCaddScores()); - break; - case MISSENSE_VARIATION_SCORE_DATA: - downloadFiles.addAll(downloader.downloadPredictionScores()); + case PROTEIN_DATA: + downloadFiles.addAll(downloader.downloadProtein()); break; case REGULATION_DATA: downloadFiles.addAll(downloader.downloadRegulation()); break; - case PROTEIN_DATA: - downloadFiles.addAll(downloader.downloadProtein()); + case VARIATION_FUNCTIONAL_SCORE_DATA: + downloadFiles.addAll(downloader.downloadCaddScores()); break; - case CONSERVATION_DATA: - downloadFiles.addAll(downloader.downloadConservation()); + case MISSENSE_VARIATION_SCORE_DATA: + downloadFiles.addAll(downloader.downloadPredictionScores()); break; case CLINICAL_VARIANT_DATA: downloadFiles.addAll(downloader.downloadClinicalVariants()); break; - case REPEATS_DATA: - downloadFiles.addAll(downloader.downloadRepeats()); + case SPLICE_SCORE_DATA: + downloadFiles.addAll(downloader.downloadSpliceScores()); break; case ONTOLOGY_DATA: downloadFiles.addAll(downloader.downloadOntologies()); break; - case SPLICE_SCORE_DATA: - downloadFiles.addAll(downloader.downloadSpliceScores()); - break; case PUBMED_DATA: downloadFiles.addAll(downloader.downloadPubMed()); break; @@ -105,8 +107,9 @@ public void execute() throws CellBaseException { downloadFiles.addAll(downloader.downloadPharmKGB()); break; default: - throw 
new IllegalArgumentException("Value '" + data + "' is not allowed for the data parameter. Valid values are: " - + StringUtils.join(VALID_SOURCES_TO_DOWNLOAD, ",") + "; or use 'all' to download everything"); + throw new IllegalArgumentException("Data parameter '" + data + "' is not allowed for '" + species + "'. " + + "Valid values are: " + StringUtils.join(speciesConfig.getData(), ",") + + ". You can use data parameter 'all' to download everything"); } } AbstractDownloadManager.writeDownloadLogFile(outputDirectory, downloadFiles); @@ -115,21 +118,25 @@ public void execute() throws CellBaseException { Thread.currentThread().interrupt(); throw new CellBaseException("Error executing command line 'download': " + e.getMessage(), e); } catch (Exception e) { - e.printStackTrace(); throw new CellBaseException("Error executing command line 'download': " + e.getMessage(), e); } } - private List checkDataSources() { - if (StringUtils.isEmpty(downloadCommandOptions.data)) { - throw new IllegalArgumentException("Missing data parameter. Valid values are: " - + StringUtils.join(VALID_SOURCES_TO_DOWNLOAD, ",") + "; or use 'all' to download everything"); - } - List dataList = Arrays.asList(downloadCommandOptions.data.split(",")); - for (String data : dataList) { - if (!VALID_SOURCES_TO_DOWNLOAD.contains(data)) { - throw new IllegalArgumentException("Value '" + data + "' is not allowed for the data parameter. 
Valid values are: " - + StringUtils.join(VALID_SOURCES_TO_DOWNLOAD, ",") + "; or use 'all' to download everything"); + private List getDataList(String species, SpeciesConfiguration speciesConfig) throws CellBaseException { + // No need to check if 'data' exists since it is declared as required in JCommander + List dataList; + if ("all".equalsIgnoreCase(downloadCommandOptions.data)) { + // Download all data sources for the species in the configuration.yml file + dataList = speciesConfig.getData(); + } else { + // Check if the data sources requested are valid for the species + dataList = Arrays.asList(downloadCommandOptions.data.split(",")); + for (String data : dataList) { + if (!speciesConfig.getData().contains(data)) { + throw new CellBaseException("Data parameter '" + data + "' does not exist or it is not allowed for '" + species + "'. " + + "Valid values are: " + StringUtils.join(speciesConfig.getData(), ",") + ". " + + "You can use data parameter 'all' to download everything"); + } } } return dataList; diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/InstallCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/InstallCommandExecutor.java deleted file mode 100644 index 70849eb924..0000000000 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/InstallCommandExecutor.java +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright 2015-2020 OpenCB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.opencb.cellbase.app.cli.admin.executors; - -import org.opencb.cellbase.app.cli.CommandExecutor; -import org.opencb.cellbase.app.cli.admin.AdminCliOptionsParser; -import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.lib.install.InstallManager; - -public class InstallCommandExecutor extends CommandExecutor { - - private AdminCliOptionsParser.InstallCommandOptions installCommandOptions; - - public InstallCommandExecutor(AdminCliOptionsParser.InstallCommandOptions installCommandOptions) { - super(installCommandOptions.commonOptions.logLevel, installCommandOptions.commonOptions.conf); - - this.installCommandOptions = installCommandOptions; - } - - public void execute() throws CellBaseException { - try { - logger.info("Starting installation ..."); - InstallManager installManager = new InstallManager(configuration); - installManager.install(installCommandOptions.speciesAndAssemblyOptions.species, - installCommandOptions.speciesAndAssemblyOptions.assembly); - } catch (CellBaseException e) { - logger.error("Error installing:" + e.toString()); - } - } -} diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/CellBaseConfiguration.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/CellBaseConfiguration.java index de470db66d..c30d3d6bea 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/CellBaseConfiguration.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/CellBaseConfiguration.java @@ -19,7 +19,6 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; import com.google.common.base.CaseFormat; -import org.apache.commons.lang.StringUtils; import org.opencb.commons.utils.FileUtils; import org.slf4j.LoggerFactory; @@ -27,7 +26,8 @@ import java.io.InputStream; import java.io.OutputStream; 
import java.nio.file.Path; -import java.util.*; +import java.util.HashMap; +import java.util.Map; public class CellBaseConfiguration { @@ -135,10 +135,7 @@ private static DatabaseCredentials secureGetMongodb(CellBaseConfiguration config configuration.setDatabases(new Databases()); } if (configuration.getDatabases().getMongodb() == null) { - configuration.getDatabases().setMongodb(new MongoDBDatabaseCredentials()); - } - if (configuration.getDatabases().getMongodb().getShards() == null) { - configuration.getDatabases().getMongodb().setShards(Collections.emptyList()); + configuration.getDatabases().setMongodb(new DatabaseCredentials()); } if (configuration.getDatabases().getMongodb().getOptions() == null) { configuration.getDatabases().getMongodb().setOptions(new HashMap<>()); @@ -228,51 +225,6 @@ public void setSpecies(SpeciesProperties species) { this.species = species; } - /** - * get the config for this species. - * @param id shortName for species, e.g. hsapiens - * @return configuration for this species - */ - public SpeciesConfiguration getSpeciesConfig(String id) { - if (StringUtils.isEmpty(id)) { - return null; - } - List allSpecies = getAllSpecies(); - for (SpeciesConfiguration config : allSpecies) { - if (config.getId().equals(id)) { - return config; - } - } - return null; - } - - public List getAllSpecies() { - List allSpecies = new ArrayList<>(); - if (species.getVertebrates() != null && !species.getVertebrates().isEmpty()) { - allSpecies.addAll(species.getVertebrates()); - } - if (species.getMetazoa() != null && !species.getMetazoa().isEmpty()) { - allSpecies.addAll(species.getMetazoa()); - } - if (species.getFungi() != null && !species.getFungi().isEmpty()) { - allSpecies.addAll(species.getFungi()); - } - if (species.getProtist() != null && !species.getProtist().isEmpty()) { - allSpecies.addAll(species.getProtist()); - } - if (species.getPlants() != null && !species.getPlants().isEmpty()) { - allSpecies.addAll(species.getPlants()); - } - if 
(species.getVirus() != null && !species.getVirus().isEmpty()) { - allSpecies.addAll(species.getVirus()); - } - if (species.getBacteria() != null && !species.getBacteria().isEmpty()) { - allSpecies.addAll(species.getBacteria()); - } - - return allSpecies; - } - public ServerProperties getServer() { return server; } diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DatabaseCredentials.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DatabaseCredentials.java index 304c191d78..ab9c8a6e94 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DatabaseCredentials.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DatabaseCredentials.java @@ -18,15 +18,13 @@ import java.util.Map; -/** - * Created by imedina on 19/08/16. - */ + public class DatabaseCredentials { - private String host; - private String user; - private String password; - private Map options; + protected String host; + protected String user; + protected String password; + protected Map options; public DatabaseCredentials() { } @@ -40,7 +38,7 @@ public DatabaseCredentials(String host, String user, String password, Map getOptions() { return options; } - public void setOptions(Map options) { + public DatabaseCredentials setOptions(Map options) { this.options = options; + return this; } } diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/Databases.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/Databases.java index 4c0cf374c7..905780fcdb 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/Databases.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/Databases.java @@ -16,48 +16,32 @@ package org.opencb.cellbase.core.config; -import java.util.Map; - -/** - * Created by imedina on 16/09/16. 
- */ public class Databases { - private MongoDBDatabaseCredentials mongodb; - private Map neo4j; + private DatabaseCredentials mongodb; public Databases() { } - public Databases(MongoDBDatabaseCredentials mongodb, Map neo4j) { + public Databases(DatabaseCredentials mongodb) { this.mongodb = mongodb; - this.neo4j = neo4j; } @Override public String toString() { final StringBuilder sb = new StringBuilder("Databases{"); sb.append("mongodb=").append(mongodb); - sb.append(", neo4j=").append(neo4j); sb.append('}'); return sb.toString(); } - public MongoDBDatabaseCredentials getMongodb() { + public DatabaseCredentials getMongodb() { return mongodb; } - public Databases setMongodb(MongoDBDatabaseCredentials mongodb) { + public Databases setMongodb(DatabaseCredentials mongodb) { this.mongodb = mongodb; return this; } - public Map getNeo4j() { - return neo4j; - } - - public Databases setNeo4j(Map neo4j) { - this.neo4j = neo4j; - return this; - } } diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/MongoDBDatabaseCredentials.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/MongoDBDatabaseCredentials.java deleted file mode 100644 index 2582b24115..0000000000 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/MongoDBDatabaseCredentials.java +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright 2015-2020 OpenCB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.opencb.cellbase.core.config; - -import java.util.List; -import java.util.Map; - -/** - * Created by imedina on 19/08/16. - */ -public class MongoDBDatabaseCredentials extends DatabaseCredentials { - - private List shards; - private String host; - private String user; - private String password; - private Map options; - - public MongoDBDatabaseCredentials() { - } - - public MongoDBDatabaseCredentials(String host, String user, String password, List shards, Map options) { - super(host, user, password, options); - this.shards = shards; - } - - @Override - public String toString() { - final StringBuilder sb = new StringBuilder("DatabaseProperties{"); - sb.append("host='").append(host).append('\''); - sb.append(", user='").append(user).append('\''); - sb.append(", password='").append(password).append('\''); - sb.append(", replicaSets='").append(shards).append('\''); - sb.append(", options=").append(options); - sb.append('}'); - return sb.toString(); - } - - public List getShards() { - return shards; - } - - public MongoDBDatabaseCredentials setShards(List shards) { - this.shards = shards; - return this; - } - - public static class ReplicaSet { - private String id; - private String nodes; - - /** - * @return the replicaset name, e.g. rs0 - */ - public String getId() { - return id; - } - - /** - * @param id label for the replicaset, e.g. rs0 - * @return the replicaset of interest - */ - public ReplicaSet setId(String id) { - this.id = id; - return this; - } - - /** - * @return nodes for replica set, e.g. cb-mongo-shard1-1:27017,cb-mongo-shard1-2:27017,cb-mongo-shard1-3:27017 - */ - public String getNodes() { - return nodes; - } - - /** - * @param nodes nodes for replica set, e.g. 
cb-mongo-shard1-1:27017,cb-mongo-shard1-2:27017,cb-mongo-shard1-3:27017 - * @return nodes for this replica set - */ - public ReplicaSet setNodes(String nodes) { - this.nodes = nodes; - return this; - } - } -} diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/SpeciesConfiguration.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/SpeciesConfiguration.java index 3a3fae4d9f..5c4976675c 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/SpeciesConfiguration.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/SpeciesConfiguration.java @@ -18,9 +18,7 @@ import java.util.List; -/** - * Created by imedina on 19/08/16. - */ + public class SpeciesConfiguration { private String id; @@ -28,23 +26,19 @@ public class SpeciesConfiguration { private String commonName; private List assemblies; private List data; - private List shards; public SpeciesConfiguration() { } - public SpeciesConfiguration(String id, String scientificName, String commonName, List assemblies, List data, - List shards) { + public SpeciesConfiguration(String id, String scientificName, String commonName, List assemblies, List data) { this.id = id; this.scientificName = scientificName; this.commonName = commonName; this.assemblies = assemblies; this.data = data; - this.shards = shards; } - @Override public String toString() { final StringBuilder sb = new StringBuilder("Species{"); @@ -53,7 +47,6 @@ public String toString() { sb.append(", commonName='").append(commonName).append('\''); sb.append(", assemblies=").append(assemblies); sb.append(", data=").append(data); - sb.append(", shards=").append(shards); sb.append('}'); return sb.toString(); } @@ -62,40 +55,45 @@ public String getId() { return id; } - public void setId(String id) { + public SpeciesConfiguration setId(String id) { this.id = id; + return this; } public String getScientificName() { return scientificName; } - public void setScientificName(String scientificName) { + 
public SpeciesConfiguration setScientificName(String scientificName) { this.scientificName = scientificName; + return this; } public String getCommonName() { return commonName; } - public void setCommonName(String commonName) { + public SpeciesConfiguration setCommonName(String commonName) { this.commonName = commonName; + return this; } public List getAssemblies() { return assemblies; } - public void setAssemblies(List assemblies) { + public SpeciesConfiguration setAssemblies(List assemblies) { this.assemblies = assemblies; + return this; } public List getData() { return data; } - public void setData(List data) { + public SpeciesConfiguration setData(List data) { this.data = data; + return this; } public static class Assembly { @@ -103,126 +101,51 @@ public static class Assembly { private String ensemblVersion; private String ensemblCollection; // Only for bacteria - public String getName() { - return name; + public Assembly() { } - public void setName(String name) { + public Assembly(String ensemblCollection, String ensemblVersion, String name) { + this.ensemblCollection = ensemblCollection; + this.ensemblVersion = ensemblVersion; this.name = name; } - public String getEnsemblVersion() { - return ensemblVersion; - } - - public void setEnsemblVersion(String ensemblVersion) { - this.ensemblVersion = ensemblVersion; + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("Assembly{"); + sb.append("ensemblCollection='").append(ensemblCollection).append('\''); + sb.append(", name='").append(name).append('\''); + sb.append(", ensemblVersion='").append(ensemblVersion).append('\''); + sb.append('}'); + return sb.toString(); } public String getEnsemblCollection() { return ensemblCollection; } - public void setEnsemblCollection(String ensemblCollection) { + public Assembly setEnsemblCollection(String ensemblCollection) { this.ensemblCollection = ensemblCollection; - } - } - - public List getShards() { - return shards; - } - - public 
SpeciesConfiguration setShards(List shards) { - this.shards = shards; - return this; - } - - public static class ShardConfig { - private String collection; - private List key; - private String rangeKey; - private List zones; - - public String getCollection() { - return collection; - } - - public ShardConfig setCollection(String collection) { - this.collection = collection; - return this; - } - - public List getKey() { - return key; - } - - public ShardConfig setKey(List key) { - this.key = key; - return this; - } - - public String getRangeKey() { - return rangeKey; - } - - public ShardConfig setRangeKey(String rangeKey) { - this.rangeKey = rangeKey; return this; } - public List getZones() { - return zones; + public String getEnsemblVersion() { + return ensemblVersion; } - public ShardConfig setZones(List zones) { - this.zones = zones; + public Assembly setEnsemblVersion(String ensemblVersion) { + this.ensemblVersion = ensemblVersion; return this; } - } - - public static class Zone { - private String name; - private List shardRanges; public String getName() { return name; } - public Zone setName(String name) { + public Assembly setName(String name) { this.name = name; return this; } - - public List getShardRanges() { - return shardRanges; - } - - public Zone setShardRanges(List shardRanges) { - this.shardRanges = shardRanges; - return this; - } } - public static class ShardRange { - private String minimum; - private String maximum; - - public String getMinimum() { - return minimum; - } - - public ShardRange setMinimum(String minimum) { - this.minimum = minimum; - return this; - } - - public String getMaximum() { - return maximum; - } - - public ShardRange setMaximum(String maximum) { - this.maximum = maximum; - return this; - } - } } diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/utils/SpeciesUtils.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/utils/SpeciesUtils.java index c928f783e4..39c0b7e0f3 100644 --- 
a/cellbase-core/src/main/java/org/opencb/cellbase/core/utils/SpeciesUtils.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/utils/SpeciesUtils.java @@ -16,12 +16,15 @@ package org.opencb.cellbase.core.utils; +import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.StringUtils; import org.opencb.cellbase.core.common.Species; import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.config.SpeciesConfiguration; +import org.opencb.cellbase.core.config.SpeciesProperties; import org.opencb.cellbase.core.exception.CellBaseException; +import java.util.ArrayList; import java.util.List; @@ -39,7 +42,7 @@ public class SpeciesUtils { */ public static Species getSpecies(CellBaseConfiguration configuration, String speciesStr, String assemblyStr) throws CellBaseException { Species species = null; - for (SpeciesConfiguration sp : configuration.getAllSpecies()) { + for (SpeciesConfiguration sp : SpeciesUtils.getAllSpecies(configuration)) { if (speciesStr.equalsIgnoreCase(sp.getScientificName()) || speciesStr.equalsIgnoreCase(sp.getCommonName()) || speciesStr.equalsIgnoreCase(sp.getId())) { SpeciesConfiguration.Assembly assembly; @@ -82,7 +85,7 @@ public static boolean validateSpeciesAndAssembly(CellBaseConfiguration configura return false; } - for (SpeciesConfiguration sp : configuration.getAllSpecies()) { + for (SpeciesConfiguration sp : SpeciesUtils.getAllSpecies(configuration)) { if (species.equalsIgnoreCase(sp.getScientificName()) || species.equalsIgnoreCase(sp.getCommonName()) || species.equalsIgnoreCase(sp.getId())) { return getAssembly(sp, assembly) != null; @@ -96,8 +99,9 @@ public static boolean validateSpecies(CellBaseConfiguration configuration, Strin return false; } - for (SpeciesConfiguration sp : configuration.getAllSpecies()) { - if (species.equalsIgnoreCase(sp.getScientificName()) || species.equalsIgnoreCase(sp.getCommonName()) + for (SpeciesConfiguration sp : 
SpeciesUtils.getAllSpecies(configuration)) { + if (species.equalsIgnoreCase(sp.getScientificName()) + || species.equalsIgnoreCase(sp.getCommonName()) || species.equalsIgnoreCase(sp.getId())) { return true; } @@ -108,7 +112,7 @@ public static boolean validateSpecies(CellBaseConfiguration configuration, Strin public static SpeciesConfiguration getSpeciesConfiguration(CellBaseConfiguration configuration, String species) { SpeciesConfiguration speciesConfiguration = null; - for (SpeciesConfiguration sp : configuration.getAllSpecies()) { + for (SpeciesConfiguration sp : SpeciesUtils.getAllSpecies(configuration)) { if (species.equalsIgnoreCase(sp.getScientificName()) || species.equalsIgnoreCase(sp.getCommonName()) || species.equalsIgnoreCase(sp.getId())) { @@ -119,6 +123,11 @@ public static SpeciesConfiguration getSpeciesConfiguration(CellBaseConfiguration return speciesConfiguration; } + public static boolean hasData(CellBaseConfiguration configuration, String species, String data) { + SpeciesConfiguration speciesConfiguration = SpeciesUtils.getSpeciesConfiguration(configuration, species); + return CollectionUtils.isNotEmpty(speciesConfiguration.getData()) && speciesConfiguration.getData().contains(data); + } + /** * Get the default assembly for species. Is naive and just gets the first one. Order not guaranteed, don't rely on this at all. 
* @@ -134,6 +143,34 @@ public static SpeciesConfiguration.Assembly getDefaultAssembly(SpeciesConfigurat return assemblies.get(0); } + public static List getAllSpecies(CellBaseConfiguration cellBaseConfiguration) { + List allSpecies = new ArrayList<>(); + SpeciesProperties species = cellBaseConfiguration.getSpecies(); + if (species.getVertebrates() != null && !species.getVertebrates().isEmpty()) { + allSpecies.addAll(species.getVertebrates()); + } + if (species.getMetazoa() != null && !species.getMetazoa().isEmpty()) { + allSpecies.addAll(species.getMetazoa()); + } + if (species.getFungi() != null && !species.getFungi().isEmpty()) { + allSpecies.addAll(species.getFungi()); + } + if (species.getProtist() != null && !species.getProtist().isEmpty()) { + allSpecies.addAll(species.getProtist()); + } + if (species.getPlants() != null && !species.getPlants().isEmpty()) { + allSpecies.addAll(species.getPlants()); + } + if (species.getVirus() != null && !species.getVirus().isEmpty()) { + allSpecies.addAll(species.getVirus()); + } + if (species.getBacteria() != null && !species.getBacteria().isEmpty()) { + allSpecies.addAll(species.getBacteria()); + } + + return allSpecies; + } + /** * Get the default assembly for species. Is naive and just gets the first one. Order not guaranteed, don't rely on this at all. 
* diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 747bf94a7c..88d8d8a9fd 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -16,11 +16,6 @@ databases: host: "${CELLBASE.DB.MONGODB.HOST}" user: "${CELLBASE.DB.USER}" password: "${CELLBASE.DB.PASSWORD}" - shards: - - id: "${CELLBASE.DB.MONGODB.REPLICASET.0.NAME}" - nodes: "${CELLBASE.DB.MONGODB.REPLICASET.0}" - - id: "${CELLBASE.DB.MONGODB.REPLICASET.1.NAME}" - nodes: "${CELLBASE.DB.MONGODB.REPLICASET.1}" options: authenticationDatabase: "${CELLBASE.DB.MONGODB.AUTHENTICATIONDATABASE}" authenticationMechanism: "${CELLBASE.DB.MONGODB.AUTHENTICATION_MECHANISM}" @@ -28,18 +23,9 @@ databases: replicaSet: "${CELLBASE.DB.MONGODB.REPLICASET}" connectionsPerHost: 20 sslEnabled: false -# sslInvalidCertificatesAllowed: true -# sslInvalidHostnameAllowed: true + # sslInvalidCertificatesAllowed: true + # sslInvalidHostnameAllowed: true enableSharding: false - neo4j: - hsapiens: - host: "${CELLBASE.DB.NEO4J.HOST}" - user: "${CELLBASE.DB.USER}" - password: "${CELLBASE.DB.PASSWORD}" - mmusculus: - host: "${CELLBASE.DB.NEO4J.HOST}" - user: "${CELLBASE.DB.USER}" - password: "${CELLBASE.DB.PASSWORD}" server: rest: port: "${CELLBASE.SERVER.REST.PORT}" @@ -90,6 +76,18 @@ download: GENOMIC_FNA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz PROTEIN_FAA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_protein.faa.gz RNA_FNA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz + MMUSCULUS_GENOMIC_GTF: M_musculus/annotation_releases/GCF_000001635.27-RS_2024_02/GCF_000001635.27_GRCm39_genomic.gtf.gz + MMUSCULUS_GENOMIC_FNA: M_musculus/annotation_releases/GCF_000001635.27-RS_2024_02/GCF_000001635.27_GRCm39_genomic.fna.gz + MMUSCULUS_PROTEIN_FAA: 
M_musculus/annotation_releases/GCF_000001635.27-RS_2024_02/GCF_000001635.27_GRCm39_protein.faa.gz + MMUSCULUS_RNA_FNA: M_musculus/annotation_releases/GCF_000001635.27-RS_2024_02/GCF_000001635.27_GRCm39_rna.fna.gz + RNORVEGICUS_GENOMIC_GTF: R_norvegicus/annotation_releases/GCF_036323735.1-RS_2024_02/GCF_036323735.1_GRCr8_genomic.gtf.gz + RNORVEGICUS_GENOMIC_FNA: R_norvegicus/annotation_releases/GCF_036323735.1-RS_2024_02/GCF_036323735.1_GRCr8_genomic.fna.gz + RNORVEGICUS_PROTEIN_FAA: R_norvegicus/annotation_releases/GCF_036323735.1-RS_2024_02/GCF_036323735.1_GRCr8_protein.faa.gz + RNORVEGICUS_RNA_FNA: R_norvegicus/annotation_releases/GCF_036323735.1-RS_2024_02/GCF_036323735.1_GRCr8_rna.fna.gz + BTAURUS_GENOMIC_GTF: B_taurus/annotation_releases/GCF_002263795.3-RS_2023_09/GCF_002263795.3_ARS-UCD2.0_genomic.gtf.gz + BTAURUS_GENOMIC_FNA: B_taurus/annotation_releases/GCF_002263795.3-RS_2023_09/GCF_002263795.3_ARS-UCD2.0_genomic.fna.gz + BTAURUS_PROTEIN_FAA: B_taurus/annotation_releases/GCF_002263795.3-RS_2023_09/GCF_002263795.3_ARS-UCD2.0_protein.faa.gz + BTAURUS_RNA_FNA: B_taurus/annotation_releases/GCF_002263795.3-RS_2023_09/GCF_002263795.3_ARS-UCD2.0_rna.fna.gz maneSelect: host: https://ftp.ncbi.nlm.nih.gov/refseq/ version: "1.2" @@ -116,10 +114,16 @@ download: files: DGIDB: data/monthly_tsvs/2022-Feb/interactions.tsv geneUniprotXref: - host: http://ftp.uniprot.org/ + host: http://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/ version: "2024-03-27" files: - UNIPROT_XREF: pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz + UNIPROT_XREF: HUMAN_9606_idmapping_selected.tab.gz + MMUSCULUS_UNIPROT_XREF: MOUSE_10090_idmapping_selected.tab.gz + RNORVEGICUS_UNIPROT_XREF: RAT_10116_idmapping_selected.tab.gz + DRERIO_UNIPROT_XREF: DANRE_7955_idmapping_selected.tab.gz + DMELOANOGASTER_UNIPROT_XREF: DROME_7227_idmapping_selected.tab.gz + SCEREVISIAE_UNIPROT_XREF: 
YEAST_559292_idmapping_selected.tab.gz + CELEGANS_UNIPROT_XREF: CAEEL_6239_idmapping_selected.tab.gz geneExpressionAtlas: host: https://ftp.ebi.ac.uk/ version: "2.0.14" @@ -145,6 +149,7 @@ download: host: http://geneontology.org/ files: GO_ANNOTATION: gene-associations/goa_human.gaf.gz + MMUSCULUS_GO_ANNOTATION: gene-associations/mgi.gaf.gz cancerGeneCensus: ## To be downloaded manually host: https://cancer.sanger.ac.uk/census/ @@ -167,6 +172,8 @@ download: # This file contains errors and has to be fixed before building # check the script cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/hsa_MTI.xlsx + MMUSCULUS_MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/mmu_MTI.xlsx + RNORVEGICUS_MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/rno_MTI.xlsx ## Protein Data uniprot: @@ -191,18 +198,21 @@ download: host: https://hgdownload.cse.ucsc.edu/ version: "2022-08-30" files: - PHASTCONS: goldenPath/put_assembly_here/phastCons470way/put_assembly_here.470way.phastCons/chrput_chromosome_here.phastCons470way.wigFix.gz + PHASTCONS: goldenPath/hg38/phastCons470way/hg38.470way.phastCons/ + MMUSCULUS_PHASTCONS: goldenPath/mm39/phastCons35way/mm39.35way.phastCons/ phylop: ## The CellBase downloader will change put_assembly_here by the assembly, e.g. 
hg38; and put_chromosome_here by the chromosomes: 1,2,..X,Y,M host: https://hgdownload.cse.ucsc.edu/ version: "2022-08-30" files: - PHYLOP: goldenPath/put_assembly_here/phyloP470way/put_assembly_here.470way.phyloP/chrput_chromosome_here.phyloP470way.wigFix.gz + PHYLOP: goldenPath/hg38/phyloP470way/hg38.470way.phyloP/ + MMUSCULUS_PHYLOP: goldenPath/mm39/phyloP35way/mm39.35way.phyloP/ gerp: host: http://ftp.ensembl.org/ version: "2023-05-17" files: GERP: pub/release-111/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw + MMUSCULUS_GERP: pub/release-111/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.mus_musculus.GRCm39.bw ## Clinical Variant clinvar: @@ -232,24 +242,36 @@ download: files: GWAS: pub/databases/gwas/releases/2024/04/22/gwas-catalog-associations_ontology-annotated.tsv DBSNP: All.vcf.gz + pharmGKB: + host: https://api.pharmgkb.org/v1/download/file/data/ + version: v1 + files: + GENES: genes.zip + CHEMICALS: chemicals.zip + VARIANTS: variants.zip + GUIDELINE_ANNOTATIONS: guidelineAnnotations.json.zip + VARIANT_ANNOTATIONS: variantAnnotations.zip + CLINICAL_ANNOTATIONS: clinicalAnnotations.zip + CLINICAL_VARIANTS: clinicalVariants.zip + DRUG_LABELS: drugLabels.zip + RELATIONSHIPS: relationships.zip dgv: host: http://dgv.tcag.ca/v106/docs simpleRepeats: host: http://hgdownload.cse.ucsc.edu/ files: - ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38 - SIMPLE_REPEATS: goldenPath/put_assembly_here/database/simpleRepeat.txt.gz + SIMPLE_REPEATS: goldenPath/hg38/database/simpleRepeat.txt.gz + MMUSCULUS_SIMPLE_REPEATS: goldenPath/mm39/database/simpleRepeat.txt.gz windowMasker: host: http://hgdownload.cse.ucsc.edu/ files: - ## The CellBase downloader will change put_assembly_here by the assembly, e.g. 
hg38 - WINDOW_MASKER: goldenPath/put_assembly_here/database/windowmaskerSdust.txt.gz + WINDOW_MASKER: goldenPath/hg38/database/windowmaskerSdust.txt.gz + MMUSCULUS_WINDOW_MASKER: goldenPath/mm39/database/windowmaskerSdust.txt.gz genomicSuperDups: host: http://hgdownload.cse.ucsc.edu/ files: - ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38 - GENOMIC_SUPER_DUPS: goldenPath/put_assembly_here/database/genomicSuperDups.txt.gz + GENOMIC_SUPER_DUPS: goldenPath/hg38/database/genomicSuperDups.txt.gz ## Variant Pathogenic Prediction revel: @@ -263,28 +285,22 @@ download: files: CADD: download/CADD/v1.7/GRCh38/whole_genome_SNVs.tsv.gz - reactome: - host: http://www.reactome.org/download/current/biopax.zip - ## OBO Ontologies + ## The version is retrieved from the OBO file hpoObo: host: http://purl.obolibrary.org/obo/ - ## The version is retrieved from the OBO file files: HPO: hp.obo goObo: host: http://purl.obolibrary.org/obo/ - ## The version is retrieved from the OBO file files: GO: go/go-basic.obo doidObo: host: http://purl.obolibrary.org/obo/ - ## The version is retrieved from the OBO file files: DOID: doid.obo mondoObo: host: http://purl.obolibrary.org/obo/ - ## The version is retrieved from the OBO file files: MONDO: mondo.obo @@ -302,19 +318,10 @@ download: version: 2024 files: PUBMED_REGEX: pubmed24n[1..1219..4].xml.gz - pharmGKB: - host: https://api.pharmgkb.org/v1/download/file/data/ - version: v1 - files: - GENES: genes.zip - CHEMICALS: chemicals.zip - VARIANTS: variants.zip - GUIDELINE_ANNOTATIONS: guidelineAnnotations.json.zip - VARIANT_ANNOTATIONS: variantAnnotations.zip - CLINICAL_ANNOTATIONS: clinicalAnnotations.zip - CLINICAL_VARIANTS: clinicalVariants.zip - DRUG_LABELS: drugLabels.zip - RELATIONSHIPS: relationships.zip + reactome: + host: http://www.reactome.org/download/current/biopax.zip + + species: vertebrates: - id: hsapiens @@ -322,74 +329,74 @@ species: assemblies: - ensemblVersion: '111_38' name: GRCh38 - - 
ensemblVersion: '82_37' - name: GRCh37 +# - ensemblVersion: '82_37' +# name: GRCh37 data: - - clinical_variants + - genome - conservation + - repeats - gene - - genome + - regulation + - protein + - clinical_variant - missense_variation_functional_score - ontology - - protein - - refseq - - regulation - - repeats - variation_functional_score - splice_score - shards: - - collection: "variation" - key: - - chromosome - - start - - end - rangeKey: "chromosome" - zones: - - name: "zone0" - shardRanges: - - minimum: "1" - maximum: "10" - - minimum: "2" - maximum: "20" - - minimum: "3" - maximum: "9" - - name: "zone1" - shardRanges: - - minimum: "10" - maximum: "2" - - minimum: "20" - maximum: "3" - - minimum: "9" - maximum: "Z" + - pharmacogenomics - id: mmusculus scientificName: Mus musculus assemblies: - - ensemblVersion: '82_38' - name: GRCm38 + - ensemblVersion: '111_39' + name: GRCm39 data: - genome - - genome_info + - conservation + - repeats - gene - - variation - regulation - protein - - conservation +# - variation + - id: rnorvegicus + scientificName: Rattus norvegicus + assemblies: + - ensemblVersion: '111_7.2' + name: mRatBN7.2 + data: + - genome + - gene + - regulation + - protein +# - variation - id: drerio scientificName: Danio rerio assemblies: - - ensemblVersion: '82_10' - name: GRCz10 + - ensemblVersion: '111_11' + name: GRCz11 + data: + - genome + - gene + - regulation + - protein +# - variation + - id: btaurus + scientificName: Bos taurus + assemblies: + - ensemblVersion: '111_1.3' + name: ARS-UCD1.3 data: - genome - genome_info - gene +# - refseq + - regulation - variation - protein - - id: rnorvegicus - scientificName: Rattus norvegicus + - id: sscrofa + scientificName: Sus scrofa assemblies: - - ensemblVersion: '82_6' - name: Rnor_6.0 + - ensemblVersion: '111_11.1' + name: Sscrofa11.1 data: - genome - genome_info diff --git a/cellbase-core/src/test/resources/configuration.yml b/cellbase-core/src/test/resources/configuration.yml index 
64ce73d692..9031275f2b 100644 --- a/cellbase-core/src/test/resources/configuration.yml +++ b/cellbase-core/src/test/resources/configuration.yml @@ -14,11 +14,6 @@ databases: host: "${CELLBASE.DB.MONGODB.HOST}" user: "${CELLBASE.DB.USER}" password: "${CELLBASE.DB.PASSWORD}" - shards: - - id: "${CELLBASE.DB.MONGODB.REPLICASET.0.NAME}" - nodes: "${CELLBASE.DB.MONGODB.REPLICASET.0}" - - id: "${CELLBASE.DB.MONGODB.REPLICASET.1.NAME}" - nodes: "${CELLBASE.DB.MONGODB.REPLICASET.1}" options: authenticationDatabase: "${CELLBASE.DB.MONGODB.AUTHENTICATIONDATABASE}" readPreference: "${CELLBASE.DB.MONGODB.READPREFERENCE}" @@ -26,15 +21,6 @@ databases: connectionsPerHost: 20 sslEnabled: false enableSharding: true - neo4j: - hsapiens: - host: "${CELLBASE.DB.NEO4J.HOST}" - user: "${CELLBASE.DB.USER}" - password: "${CELLBASE.DB.PASSWORD}" - mmusculus: - host: "${CELLBASE.DB.NEO4J.HOST}" - user: "${CELLBASE.DB.USER}" - password: "${CELLBASE.DB.PASSWORD}" server: rest: port: 9090 @@ -162,30 +148,6 @@ species: - repeats - variation_functional_score - splice_score - shards: - - collection: "variation" - key: - - chromosome - - start - - end - rangeKey: "chromosome" - zones: - - name: "zone0" - shardRanges: - - minimum: "1" - maximum: "10" - - minimum: "2" - maximum: "20" - - minimum: "3" - maximum: "9" - - name: "zone1" - shardRanges: - - minimum: "10" - maximum: "2" - - minimum: "20" - maximum: "3" - - minimum: "9" - maximum: "Z" - id: mmusculus scientificName: Mus musculus assemblies: diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 8b8bbe7075..f2c8ffed15 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -47,6 +47,10 @@ public final class EtlCommons { // Commons public static final String HOMO_SAPIENS_NAME= "Homo sapiens"; public static final String HSAPIENS_NAME= "hsapiens"; 
+ public static final String MUS_MUSCULUS_NAME= "Mus musculus"; + public static final String RATTUS_NORVEGICUS_NAME= "Rattus norvegicus"; + public static final String BOS_TAURUS_NAME= "Bos taurus"; + public static final String DANIO_RERIO_NAME= "Danio rerio"; public static final String GRCH38_NAME = "GRCh38"; public static final String GRCH37_NAME = "GRCh37"; @@ -88,6 +92,7 @@ public final class EtlCommons { // Genome public static final String GENOME_DATA = "genome"; + public static final String GENOME_INFO_DATA = "genome_info"; // Gene public static final String GENE_DATA = "gene"; @@ -347,6 +352,7 @@ public final class EtlCommons { dataNamesMap.put(ENSEMBL_DATA, "Ensembl"); dataNamesMap.put(REFSEQ_DATA, "RefSeq"); dataNamesMap.put(GENOME_DATA, "Genome"); + dataNamesMap.put(GENOME_INFO_DATA, "Genome Info"); dataNamesMap.put(GENE_DATA, "Gene"); dataNamesMap.put(GENE_ANNOTATION_DATA, "Gene Annotation"); dataNamesMap.put(MANE_SELECT_DATA, "MANE Select"); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java index eeb91729a5..4056dd18b4 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java @@ -39,9 +39,7 @@ import static org.opencb.cellbase.lib.EtlCommons.*; -/** - * Created by imedina on 30/08/14. 
- */ + public abstract class CellBaseBuilder { protected CellBaseSerializer serializer; @@ -82,6 +80,15 @@ public void disconnect() { } } + protected String getConfigurationFileIdPrefix(String scientificSpecies) { + String prefix = ""; + if (StringUtils.isNotEmpty(scientificSpecies) && !scientificSpecies.equals("Homo sapiens") && scientificSpecies.contains(" ")) { + char c = scientificSpecies.charAt(0); + prefix = (c + scientificSpecies.split(" ")[1] + "_").toUpperCase(); + } + return prefix; + } + protected File checkFile(DownloadProperties.URLProperties props, String fileId, Path targetPath, String name) throws CellBaseException { logger.info("Checking file {} (file ID {} in config.) ...", name, fileId); String filename = Paths.get(props.getFiles().get(fileId)).getFileName().toString(); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java index 27098bcf14..e8ea728da3 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java @@ -134,20 +134,23 @@ public void check() throws Exception { geneDescriptionFile = checkFile(props, ENSEMBL_DESCRIPTION_FILE_ID, downloadPath.getParent(), "Ensembl Description").toPath(); xrefsFile = checkFile(props, ENSEMBL_XREFS_FILE_ID, downloadPath.getParent(), "Ensembl Xrefs").toPath(); ensemblCanonicalFile = checkFile(props, ENSEMBL_CANONICAL_FILE_ID, downloadPath.getParent(), "Ensembl Canonical").toPath(); - tso500File = checkFile(props, ENSEMBL_TSO500_FILE_ID, downloadPath.getParent(), "Ensembl TSO 500").toPath(); - eglhHaemOncFile = checkFile(props, ENSEMBL_HAEM_ONC_TRANSCRIPTS_FILE_ID, downloadPath.getParent(), "EGLH Haem Onc").toPath(); - maneFile = checkFiles(MANE_SELECT_DATA, downloadPath.getParent(), 1).get(0).toPath(); - lrgFile = checkFiles(LRG_DATA, 
downloadPath.getParent(), 1).get(0).toPath(); - hgncFile = checkFiles(HGNC_DATA, downloadPath.getParent(), 1).get(0).toPath(); - cancerHostpotFile = checkFiles(CANCER_HOTSPOT_DATA, downloadPath.getParent(), 1).get(0).toPath(); - geneDrugFile = checkFiles(DGIDB_DATA, downloadPath.getParent(), 1).get(0).toPath(); - uniprotIdMappingFile = checkFiles(UNIPROT_XREF_DATA, downloadPath.getParent(), 1).get(0).toPath(); - geneExpressionFile = checkFiles(GENE_EXPRESSION_ATLAS_DATA, downloadPath.getParent(), 1).get(0).toPath(); - hpoFile = checkFiles(HPO_DISEASE_DATA, downloadPath.getParent(), 1).get(0).toPath(); - disgenetFile = checkFiles(DISGENET_DATA, downloadPath.getParent(), 1).get(0).toPath(); - gnomadFile = checkFiles(GNOMAD_CONSTRAINTS_DATA, downloadPath.getParent(), 1).get(0).toPath(); - geneOntologyAnnotationFile = checkFiles(GO_ANNOTATION_DATA, downloadPath.getParent(), 1).get(0).toPath(); - cancerGeneCensusFile = checkFiles(CANCER_GENE_CENSUS_DATA, downloadPath.getParent(), 1).get(0).toPath(); + + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + tso500File = checkFile(props, ENSEMBL_TSO500_FILE_ID, downloadPath.getParent(), "Ensembl TSO 500").toPath(); + eglhHaemOncFile = checkFile(props, ENSEMBL_HAEM_ONC_TRANSCRIPTS_FILE_ID, downloadPath.getParent(), "EGLH Haem Onc").toPath(); + maneFile = checkFiles(MANE_SELECT_DATA, downloadPath.getParent(), 1).get(0).toPath(); + lrgFile = checkFiles(LRG_DATA, downloadPath.getParent(), 1).get(0).toPath(); + hgncFile = checkFiles(HGNC_DATA, downloadPath.getParent(), 1).get(0).toPath(); + cancerHostpotFile = checkFiles(CANCER_HOTSPOT_DATA, downloadPath.getParent(), 1).get(0).toPath(); + geneDrugFile = checkFiles(DGIDB_DATA, downloadPath.getParent(), 1).get(0).toPath(); + uniprotIdMappingFile = checkFiles(UNIPROT_XREF_DATA, downloadPath.getParent(), 1).get(0).toPath(); + geneExpressionFile = checkFiles(GENE_EXPRESSION_ATLAS_DATA, downloadPath.getParent(), 1).get(0).toPath(); + hpoFile = 
checkFiles(HPO_DISEASE_DATA, downloadPath.getParent(), 1).get(0).toPath(); + disgenetFile = checkFiles(DISGENET_DATA, downloadPath.getParent(), 1).get(0).toPath(); + gnomadFile = checkFiles(GNOMAD_CONSTRAINTS_DATA, downloadPath.getParent(), 1).get(0).toPath(); + geneOntologyAnnotationFile = checkFiles(GO_ANNOTATION_DATA, downloadPath.getParent(), 1).get(0).toPath(); + cancerGeneCensusFile = checkFiles(CANCER_GENE_CENSUS_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } // Check regulation files // Motif features diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java index b291d2f9cd..7b3b9f345b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java @@ -103,13 +103,14 @@ public void check() throws Exception { } // Check RefSeq files + String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); DownloadProperties.URLProperties props = configuration.getDownload().getRefSeq(); - gtfFile = checkFile(props, REFSEQ_GENOMIC_GTF_FILE_ID, downloadPath, "RefSeq GTF").toPath(); - proteinFastaFile = checkFile(props, REFSEQ_PROTEIN_FAA_FILE_ID, downloadPath, "RefSeq Protein FAA").toPath(); - cdnaFastaFile = checkFile(props, REFSEQ_RNA_FNA_FILE_ID, downloadPath, "RefSeq RNA FNA").toPath(); + gtfFile = checkFile(props, prefixId + REFSEQ_GENOMIC_GTF_FILE_ID, downloadPath, "RefSeq GTF").toPath(); + proteinFastaFile = checkFile(props, prefixId + REFSEQ_PROTEIN_FAA_FILE_ID, downloadPath, "RefSeq Protein FAA").toPath(); + cdnaFastaFile = checkFile(props, prefixId + REFSEQ_RNA_FNA_FILE_ID, downloadPath, "RefSeq RNA FNA").toPath(); // Check genome FASTA file - String genomeGzFilename = Paths.get(props.getFiles().get(REFSEQ_GENOMIC_FNA_FILE_ID)).getFileName().toString(); + String genomeGzFilename = 
Paths.get(props.getFiles().get(prefixId + REFSEQ_GENOMIC_FNA_FILE_ID)).getFileName().toString(); fastaFile = downloadPath.resolve(genomeGzFilename); if (Files.exists(fastaFile)) { // Need to be gunzip-ed @@ -132,16 +133,17 @@ public void check() throws Exception { // Check common files props = configuration.getDownload().getEnsembl().getUrl(); - tso500File = checkFile(props, ENSEMBL_TSO500_FILE_ID, downloadPath.getParent(), "Ensembl TSO 500").toPath(); - eglhHaemOncFile = checkFile(props, ENSEMBL_HAEM_ONC_TRANSCRIPTS_FILE_ID, downloadPath.getParent(), "EGLH Haem Onc").toPath(); - - maneFile = checkFiles(MANE_SELECT_DATA, downloadPath.getParent(), 1).get(0).toPath(); - lrgFile = checkFiles(LRG_DATA, downloadPath.getParent(), 1).get(0).toPath(); - cancerHotspot = checkFiles(CANCER_HOTSPOT_DATA, downloadPath.getParent(), 1).get(0).toPath(); - geneDrugFile = checkFiles(DGIDB_DATA, downloadPath.getParent(), 1).get(0).toPath(); - hpoFile = checkFiles(HPO_DISEASE_DATA, downloadPath.getParent(), 1).get(0).toPath(); - disgenetFile = checkFiles(DISGENET_DATA, downloadPath.getParent(), 1).get(0).toPath(); - cancerGeneCensusFile = checkFiles(CANCER_GENE_CENSUS_DATA, downloadPath.getParent(), 1).get(0).toPath(); + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + tso500File = checkFile(props, ENSEMBL_TSO500_FILE_ID, downloadPath.getParent(), "Ensembl TSO 500").toPath(); + eglhHaemOncFile = checkFile(props, ENSEMBL_HAEM_ONC_TRANSCRIPTS_FILE_ID, downloadPath.getParent(), "EGLH Haem Onc").toPath(); + maneFile = checkFiles(MANE_SELECT_DATA, downloadPath.getParent(), 1).get(0).toPath(); + lrgFile = checkFiles(LRG_DATA, downloadPath.getParent(), 1).get(0).toPath(); + cancerHotspot = checkFiles(CANCER_HOTSPOT_DATA, downloadPath.getParent(), 1).get(0).toPath(); + geneDrugFile = checkFiles(DGIDB_DATA, downloadPath.getParent(), 1).get(0).toPath(); + hpoFile = checkFiles(HPO_DISEASE_DATA, downloadPath.getParent(), 1).get(0).toPath(); + disgenetFile = 
checkFiles(DISGENET_DATA, downloadPath.getParent(), 1).get(0).toPath(); + cancerGeneCensusFile = checkFiles(CANCER_GENE_CENSUS_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } // Check regulation files // mirtarbase diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/db/MongoDBManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/db/MongoDBManager.java index d78c0446c8..27e3239f94 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/db/MongoDBManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/db/MongoDBManager.java @@ -52,7 +52,7 @@ public class MongoDBManager { public static final String DBNAME_SEPARATOR = "_"; private MongoDataStoreManager mongoDataStoreManager; - private CellBaseConfiguration cellBaseConfiguration; + private final CellBaseConfiguration cellBaseConfiguration; private Logger logger; @@ -105,13 +105,13 @@ public MongoDataStore createMongoDBDatastore(String speciesStr, String assemblyS } catch (CellBaseException e) { e.printStackTrace(); logger.error("Species name is not valid: '{}'. Valid species: {}", speciesStr, - String.join(",", cellBaseConfiguration.getAllSpecies().stream().map((tmpSpeciesObject) - -> (tmpSpeciesObject.getCommonName() + "|" + tmpSpeciesObject.getScientificName())) + String.join(",", SpeciesUtils.getAllSpecies(cellBaseConfiguration).stream().map((tmpSpeciesObject) + -> (tmpSpeciesObject.getCommonName() + "|" + tmpSpeciesObject.getScientificName())) .collect(Collectors.toList()))); throw new InvalidParameterException("Species name is not valid: '" + speciesStr + "'. 
Please provide one" + " of supported species: {" - + String.join(",", cellBaseConfiguration.getAllSpecies().stream().map((tmpSpeciesObject) - -> (tmpSpeciesObject.getCommonName() + "|" + tmpSpeciesObject.getScientificName())) + + String.join(",", SpeciesUtils.getAllSpecies(cellBaseConfiguration).stream().map((tmpSpeciesObject) + -> (tmpSpeciesObject.getCommonName() + "|" + tmpSpeciesObject.getScientificName())) .collect(Collectors.toList())) + "}"); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index a87faeb611..df57b06f8b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -134,15 +134,24 @@ private void init() throws CellBaseException, IOException { public abstract List download() throws IOException, InterruptedException, CellBaseException; - protected boolean speciesHasInfoToDownload(SpeciesConfiguration sp, String info) { + protected boolean speciesHasInfoToDownload(SpeciesConfiguration sp, String data) { boolean hasInfo = true; - if (sp.getData() == null || !sp.getData().contains(info)) { - logger.warn("Species '{}' has no '{}' information available to download", sp.getScientificName(), info); + if (sp.getData() == null || !sp.getData().contains(data)) { + logger.warn("Species '{}' has no '{}' information available to download", sp.getScientificName(), data); hasInfo = false; } return hasInfo; } + protected String getConfigurationFileIdPrefix(String scientificSpecies) { + String prefix = ""; + if (StringUtils.isNotEmpty(scientificSpecies) && !scientificSpecies.equals("Homo sapiens") && scientificSpecies.contains(" ")) { + char c = scientificSpecies.charAt(0); + prefix = (c + scientificSpecies.split(" ")[1] + "_").toUpperCase(); + } + return prefix; + } + protected 
DownloadFile downloadAndSaveDataSource(DownloadProperties.URLProperties props, String fileId, String data, Path outPath) throws IOException, InterruptedException, CellBaseException { return downloadAndSaveDataSource(props, fileId, data, null, outPath); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java index 0b0d09f412..e010bb676a 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java @@ -18,6 +18,7 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.utils.SpeciesUtils; import java.io.IOException; import java.nio.file.Files; @@ -36,24 +37,20 @@ public CaddDownloadManager(String species, String assembly, Path targetDirectory @Override public List download() throws IOException, InterruptedException, CellBaseException { - logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataCategory(CADD_DATA), getDataName(CADD_DATA)); + DownloadFile downloadFile = null; - if (!speciesHasInfoToDownload(speciesConfiguration, VARIATION_FUNCTIONAL_SCORE_DATA) - || !speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("{}/{} not supported for species {}", getDataCategory(CADD_DATA), getDataName(CADD_DATA), - speciesConfiguration.getScientificName()); - return Collections.emptyList(); - } + if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), VARIATION_FUNCTIONAL_SCORE_DATA)) { + logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataCategory(CADD_DATA), getDataName(CADD_DATA)); - // Create the CADD download path - Path caddDownloadPath = downloadFolder.resolve(VARIATION_FUNCTIONAL_SCORE_DATA).resolve(CADD_DATA); - Files.createDirectories(caddDownloadPath); + // 
Create the CADD download path + Path caddDownloadPath = downloadFolder.resolve(VARIATION_FUNCTIONAL_SCORE_DATA).resolve(CADD_DATA); + Files.createDirectories(caddDownloadPath); - // Download CADD and save data source - DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getCadd(), CADD_FILE_ID, CADD_DATA, - caddDownloadPath); + // Download CADD and save data source + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getCadd(), CADD_FILE_ID, CADD_DATA, caddDownloadPath); - logger.info(CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE, getDataCategory(CADD_DATA), getDataName(CADD_DATA)); + logger.info(CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE, getDataCategory(CADD_DATA), getDataName(CADD_DATA)); + } return Collections.singletonList(downloadFile); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java index 9fd0e7562c..298634c6eb 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java @@ -19,6 +19,7 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.utils.SpeciesUtils; import org.opencb.cellbase.lib.EtlCommons; import java.io.IOException; @@ -40,61 +41,59 @@ public ClinicalDownloadManager(String species, String assembly, Path outdir, Cel @Override public List download() throws IOException, InterruptedException, CellBaseException { - List downloadFiles = new ArrayList<>(); - downloadFiles.addAll(downloadClinical()); - return downloadFiles; + return downloadClinical(); } public List downloadClinical() throws IOException, InterruptedException, CellBaseException { - 
logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CLINICAL_VARIANT_DATA)); - if (!speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("{} not supported for the species {}", getDataName(CLINICAL_VARIANT_DATA), - speciesConfiguration.getScientificName()); - return Collections.emptyList(); - } - - // Create clinical directory - Path clinicalPath = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANT_DATA).toAbsolutePath(); - Files.createDirectories(clinicalPath); - - DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); - // ClinVar - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CLINVAR_DATA)); - DownloadProperties.URLProperties props = configuration.getDownload().getClinvar(); - List urls = new ArrayList<>(); - for (String fileId : Arrays.asList(CLINVAR_FULL_RELEASE_FILE_ID, CLINVAR_SUMMARY_FILE_ID, CLINVAR_ALLELE_FILE_ID, - CLINVAR_EFO_TERMS_FILE_ID)) { - downloadFile = downloadDataSource(props, fileId, clinicalPath); + // Check if the species has the data to download + if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), CLINICAL_VARIANT_DATA)) { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CLINICAL_VARIANT_DATA)); + + // Create clinical directory + Path clinicalPath = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANT_DATA).toAbsolutePath(); + Files.createDirectories(clinicalPath); + + DownloadFile downloadFile; + + // ClinVar + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CLINVAR_DATA)); + DownloadProperties.URLProperties props = configuration.getDownload().getClinvar(); + List urls = new ArrayList<>(); + for (String fileId : Arrays.asList(CLINVAR_FULL_RELEASE_FILE_ID, CLINVAR_SUMMARY_FILE_ID, CLINVAR_ALLELE_FILE_ID, + CLINVAR_EFO_TERMS_FILE_ID)) { + downloadFile = downloadDataSource(props, fileId, clinicalPath); + downloadFiles.add(downloadFile); + + // Save URLs to be written in the version file + urls.add(downloadFile.getUrl()); + } + // Save data source + 
saveDataSource(CLINVAR_DATA, props.getVersion(), getTimeStamp(), urls, + clinicalPath.resolve(getDataVersionFilename(CLINVAR_DATA))); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(CLINVAR_DATA)); + + // COSMIC + logger.warn("{} files must be downloaded manually !", getDataName(COSMIC_DATA)); + props = configuration.getDownload().getCosmic(); + String url = props.getHost() + props.getFiles().get(COSMIC_FILE_ID); + saveDataSource(COSMIC_DATA, props.getVersion(), getTimeStamp(), Collections.singletonList(url), + clinicalPath.resolve(getDataVersionFilename(COSMIC_DATA))); + + // HGMD + logger.warn("{} files must be downloaded manually !", getDataName(HGMD_DATA)); + props = configuration.getDownload().getHgmd(); + url = props.getHost() + props.getFiles().get(HGMD_FILE_ID); + saveDataSource(HGMD_DATA, props.getVersion(), getTimeStamp(), Collections.singletonList(url), + clinicalPath.resolve(getDataVersionFilename(HGMD_DATA))); + + // GWAS catalog + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GWAS_DATA)); + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGwasCatalog(), GWAS_FILE_ID, GWAS_DATA, clinicalPath); downloadFiles.add(downloadFile); - - // Save URLs to be written in the version file - urls.add(downloadFile.getUrl()); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GWAS_DATA)); } - // Save data source - saveDataSource(CLINVAR_DATA, props.getVersion(), getTimeStamp(), urls, clinicalPath.resolve(getDataVersionFilename(CLINVAR_DATA))); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(CLINVAR_DATA)); - - // COSMIC - logger.warn("{} files must be downloaded manually !", getDataName(COSMIC_DATA)); - props = configuration.getDownload().getCosmic(); - String url = props.getHost() + props.getFiles().get(COSMIC_FILE_ID); - saveDataSource(COSMIC_DATA, props.getVersion(), getTimeStamp(), Collections.singletonList(url), - clinicalPath.resolve(getDataVersionFilename(COSMIC_DATA))); - - // HGMD - logger.warn("{} files must be 
downloaded manually !", getDataName(HGMD_DATA)); - props = configuration.getDownload().getHgmd(); - url = props.getHost() + props.getFiles().get(HGMD_FILE_ID); - saveDataSource(HGMD_DATA, props.getVersion(), getTimeStamp(), Collections.singletonList(url), - clinicalPath.resolve(getDataVersionFilename(HGMD_DATA))); - - // GWAS catalog - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GWAS_DATA)); - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGwasCatalog(), GWAS_FILE_ID, GWAS_DATA, clinicalPath); - downloadFiles.add(downloadFile); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GWAS_DATA)); return downloadFiles; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ConservationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ConservationDownloadManager.java new file mode 100644 index 0000000000..f9a33b5c9c --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ConservationDownloadManager.java @@ -0,0 +1,156 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.opencb.cellbase.lib.download; + +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.utils.SpeciesUtils; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import static org.opencb.cellbase.lib.EtlCommons.*; + +public class ConservationDownloadManager extends AbstractDownloadManager { + + public ConservationDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) + throws IOException, CellBaseException { + super(species, assembly, targetDirectory, configuration); + } + + @Override + public List download() throws IOException, InterruptedException, CellBaseException { + return downloadConservation(); + } + + /** + * This method downloads both PhastCons and PhyloP data from UCSC for Human and Mouse species. 
+ * @return list of files downloaded + * @throws IOException if there is an error writing to a file + * @throws InterruptedException if there is an error downloading files + * @throws CellBaseException if there is an error executing the command line + */ + public List downloadConservation() throws IOException, InterruptedException, CellBaseException { + List downloadFiles = new ArrayList<>(); + + // Check if the species is supported + if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), CONSERVATION_DATA)) { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); + + // Create folders + Path conservationFolder = downloadFolder.resolve(CONSERVATION_DATA); + Files.createDirectories(conservationFolder); + Files.createDirectories(conservationFolder.resolve(GERP_DATA)); + Files.createDirectories(conservationFolder.resolve(PHASTCONS_DATA)); + Files.createDirectories(conservationFolder.resolve(PHYLOP_DATA)); + + // Download data + String filename; + Path outputPath; + + // Prepare variables + String phastconsHost = configuration.getDownload().getPhastCons().getHost(); + String phylopHost = configuration.getDownload().getPhylop().getHost(); + List phastconsUrls = new ArrayList<>(50); + List phyloPUrls = new ArrayList<>(50); + String gerpUrl = null; + + // Human + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + // 1. 
PhastCons and PhyloP + String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", + "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "M"}; + for (String chromosome : chromosomes) { + logger.info(DOWNLOADING_LOG_MESSAGE, "phastConst " + chromosome); + String phastConsUrl = phastconsHost + configuration.getDownload().getPhastCons().getFiles().get(PHASTCONS_FILE_ID) + + "chr" + chromosome + ".phastCons470way.wigFix.gz"; + filename = Paths.get(phastConsUrl).getFileName().toString(); + outputPath = conservationFolder.resolve(PHASTCONS_DATA).resolve(filename); + downloadFiles.add(downloadFile(phastConsUrl, outputPath.toString())); + phastconsUrls.add(phastConsUrl); + + logger.info(DOWNLOADING_LOG_MESSAGE, "phyloP " + chromosome); + String phyloPUrl = phylopHost + configuration.getDownload().getPhylop().getFiles().get(PHYLOP_FILE_ID) + + "chr" + chromosome + ".phyloP470way.wigFix.gz"; + filename = Paths.get(phyloPUrl).getFileName().toString(); + outputPath = conservationFolder.resolve(PHYLOP_DATA).resolve(filename); + downloadFiles.add(downloadFile(phyloPUrl, outputPath.toString())); + phyloPUrls.add(phyloPUrl); + } + + // 2. Gerp + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GERP_DATA)); + gerpUrl = configuration.getDownload().getGerp().getHost() + + configuration.getDownload().getGerp().getFiles().get(GERP_FILE_ID); + filename = Paths.get(gerpUrl).getFileName().toString(); + outputPath = conservationFolder.resolve(GERP_DATA).resolve(filename); + downloadFiles.add(downloadFile(gerpUrl, outputPath.toString())); + } + + // Mouse + if (speciesConfiguration.getScientificName().equals(MUS_MUSCULUS_NAME)) { + String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); + + // 1. 
PhastCons and PhyloP + String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", + "15", "16", "17", "18", "19", "X", "Y", "M"}; + for (String chromosome : chromosomes) { + logger.info(DOWNLOADING_LOG_MESSAGE, "phastConst " + chromosome); + String phastConsUrl = phastconsHost + + configuration.getDownload().getPhastCons().getFiles().get(prefixId + PHASTCONS_FILE_ID) + + "chr" + chromosome + ".phastCons35way.wigFix.gz"; + filename = Paths.get(phastConsUrl).getFileName().toString(); + outputPath = conservationFolder.resolve(PHASTCONS_DATA).resolve(filename); + downloadFiles.add(downloadFile(phastConsUrl, outputPath.toString())); + phastconsUrls.add(phastConsUrl); + + logger.info(DOWNLOADING_LOG_MESSAGE, "phyloP " + chromosome); + String phyloPUrl = phylopHost + configuration.getDownload().getPhylop().getFiles().get(prefixId + PHYLOP_FILE_ID) + + "chr" + chromosome + ".phyloP35way.wigFix.gz"; + filename = Paths.get(phyloPUrl).getFileName().toString(); + outputPath = conservationFolder.resolve(PHYLOP_DATA).resolve(filename); + downloadFiles.add(downloadFile(phyloPUrl, outputPath.toString())); + phyloPUrls.add(phyloPUrl); + } + + // 2. 
Gerp + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GERP_DATA)); + gerpUrl = configuration.getDownload().getGerp().getHost() + + configuration.getDownload().getGerp().getFiles().get(prefixId + GERP_FILE_ID); + filename = Paths.get(gerpUrl).getFileName().toString(); + outputPath = conservationFolder.resolve(GERP_DATA).resolve(filename); + downloadFiles.add(downloadFile(gerpUrl, outputPath.toString())); + } + + // Save data version + saveDataSource(PHASTCONS_DATA, configuration.getDownload().getPhastCons().getVersion(), getTimeStamp(), phastconsUrls, + conservationFolder.resolve(getDataVersionFilename(PHASTCONS_DATA))); + saveDataSource(PHYLOP_DATA, configuration.getDownload().getPhylop().getVersion(), getTimeStamp(), phyloPUrls, + conservationFolder.resolve(getDataVersionFilename(PHYLOP_DATA))); + saveDataSource(GERP_DATA, configuration.getDownload().getGerp().getVersion(), getTimeStamp(), + Collections.singletonList(gerpUrl), conservationFolder.resolve(getDataVersionFilename(GERP_DATA))); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); + } + return downloadFiles; + } + +} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/DownloadFile.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/DownloadFile.java index 5cc11acf64..079a5c921d 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/DownloadFile.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/DownloadFile.java @@ -19,6 +19,7 @@ import java.util.concurrent.TimeUnit; public class DownloadFile { + private String startTime; private String elapsedTime; private Status status; @@ -38,6 +39,20 @@ public DownloadFile(String url, String outputFile, String startTime) { this.startTime = startTime; } + @Override + public String toString() { + return "DownloadFile{" + + "startTime='" + startTime + '\'' + + ", elapsedTime='" + elapsedTime + '\'' + + ", status=" + status + + ", message='" + message + '\'' + + ", 
expectedFileSize=" + expectedFileSize + + ", actualFileSize=" + actualFileSize + + ", outputFile='" + outputFile + '\'' + + ", url='" + url + '\'' + + '}'; + } + public String getStartTime() { return startTime; } @@ -47,7 +62,7 @@ public String getElapsedTime() { } public DownloadFile setElapsedTime(Long startTime, Long endTime) { - Long elapsedTime = endTime - startTime; + long elapsedTime = endTime - startTime; this.elapsedTime = TimeUnit.MILLISECONDS.toSeconds(elapsedTime) + " seconds"; return this; } @@ -96,17 +111,4 @@ public DownloadFile setMessage(String message) { return this; } - @Override - public String toString() { - return "DownloadFile{" - + "startTime='" + startTime + '\'' - + ", elapsedTime='" + elapsedTime + '\'' - + ", status=" + status - + ", message='" + message + '\'' - + ", expectedFileSize=" + expectedFileSize - + ", actualFileSize=" + actualFileSize - + ", outputFile='" + outputFile + '\'' - + ", url='" + url + '\'' - + '}'; - } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java index cb412fd2bd..0d3203e7e2 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java @@ -16,7 +16,6 @@ package org.opencb.cellbase.lib.download; -import org.opencb.biodata.formats.io.FileFormatException; import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; @@ -26,10 +25,10 @@ public class Downloader { - private String species; - private String assembly; - private Path outputDirectory; - private CellBaseConfiguration configuration; + private final String species; + private final String assembly; + private final Path outputDirectory; + private final CellBaseConfiguration configuration; public Downloader(String species, String assembly, Path outputDirectory, 
CellBaseConfiguration configuration) { this.species = species; @@ -40,7 +39,12 @@ public Downloader(String species, String assembly, Path outputDirectory, CellBas public List downloadGenome() throws IOException, CellBaseException, InterruptedException { GenomeDownloadManager manager = new GenomeDownloadManager(species, assembly, outputDirectory, configuration); - return manager.downloadReferenceGenome(); + return manager.download(); + } + + public List downloadRepeats() throws IOException, CellBaseException, InterruptedException { + RepeatsDownloadManager manager = new RepeatsDownloadManager(species, assembly, outputDirectory, configuration); + return manager.downloadRepeats(); } public List downloadGene() throws IOException, CellBaseException, InterruptedException { @@ -48,8 +52,7 @@ public List downloadGene() throws IOException, CellBaseException, return manager.download(); } - public List downloadRegulation() throws IOException, CellBaseException, InterruptedException, - NoSuchMethodException, FileFormatException { + public List downloadRegulation() throws IOException, CellBaseException, InterruptedException { RegulationDownloadManager manager = new RegulationDownloadManager(species, assembly, outputDirectory, configuration); return manager.download(); } @@ -60,7 +63,7 @@ public List downloadProtein() throws IOException, CellBaseExceptio } public List downloadConservation() throws IOException, CellBaseException, InterruptedException { - GenomeDownloadManager manager = new GenomeDownloadManager(species, assembly, outputDirectory, configuration); + ConservationDownloadManager manager = new ConservationDownloadManager(species, assembly, outputDirectory, configuration); return manager.downloadConservation(); } @@ -69,11 +72,6 @@ public List downloadClinicalVariants() throws IOException, CellBas return manager.download(); } - public List downloadRepeats() throws IOException, CellBaseException, InterruptedException { - GenomeDownloadManager manager = new 
GenomeDownloadManager(species, assembly, outputDirectory, configuration); - return manager.downloadRepeats(); - } - public List downloadOntologies() throws IOException, CellBaseException, InterruptedException { OntologyDownloadManager manager = new OntologyDownloadManager(species, assembly, outputDirectory, configuration); return manager.download(); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index 06f568d103..57eff8d865 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -19,6 +19,7 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.utils.SpeciesUtils; import java.io.IOException; import java.nio.file.Files; @@ -34,11 +35,12 @@ public class GeneDownloadManager extends AbstractDownloadManager { static { GENE_UNIPROT_XREF_FILES = new HashMap<>(); GENE_UNIPROT_XREF_FILES.put(HOMO_SAPIENS_NAME, "HUMAN_9606_idmapping_selected.tab.gz"); - GENE_UNIPROT_XREF_FILES.put("Mus musculus", "MOUSE_10090_idmapping_selected.tab.gz"); - GENE_UNIPROT_XREF_FILES.put("Rattus norvegicus", "RAT_10116_idmapping_selected.tab.gz"); - GENE_UNIPROT_XREF_FILES.put("Danio rerio", "DANRE_7955_idmapping_selected.tab.gz"); + GENE_UNIPROT_XREF_FILES.put(MUS_MUSCULUS_NAME, "MOUSE_10090_idmapping_selected.tab.gz"); + GENE_UNIPROT_XREF_FILES.put(RATTUS_NORVEGICUS_NAME, "RAT_10116_idmapping_selected.tab.gz"); + GENE_UNIPROT_XREF_FILES.put(DANIO_RERIO_NAME, "DANRE_7955_idmapping_selected.tab.gz"); GENE_UNIPROT_XREF_FILES.put("Drosophila melanogaster", "DROME_7227_idmapping_selected.tab.gz"); GENE_UNIPROT_XREF_FILES.put("Saccharomyces cerevisiae", 
"YEAST_559292_idmapping_selected.tab.gz"); + GENE_UNIPROT_XREF_FILES.put("Caenorhabditis elegans", "CAEEL_6239_idmapping_selected.tab.gz"); } public GeneDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) @@ -90,6 +92,7 @@ public List download() throws IOException, InterruptedException, C geneDownloadPath.resolve(getDataVersionFilename(HPO_DISEASE_DATA))); logger.warn("{} must be downloaded manually; the version file {} was created at {}", getDataName(HPO_DISEASE_DATA), getDataVersionFilename(HPO_DISEASE_DATA), geneDownloadPath); + // Cancer gene census saveDataSource(CANCER_GENE_CENSUS_DATA, configuration.getDownload().getCancerGeneCensus().getVersion(), getTimeStamp(), Collections.singletonList(getManualUrl(configuration.getDownload().getCancerGeneCensus(), CANCER_GENE_CENSUS_FILE_ID)), @@ -103,177 +106,208 @@ public List download() throws IOException, InterruptedException, C } private List downloadEnsemblData(Path ensemblDownloadPath) throws IOException, InterruptedException, CellBaseException { - logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataName(ENSEMBL_DATA), getDataCategory(ENSEMBL_DATA)); - List downloadFiles = new ArrayList<>(); - DownloadProperties.EnsemblProperties ensemblProps = configuration.getDownload().getEnsembl(); - - // GTF - downloadFiles.add(downloadEnsemblDataSource(ensemblProps, ENSEMBL_GTF_FILE_ID, ensemblDownloadPath)); - // PEP - downloadFiles.add(downloadEnsemblDataSource(ensemblProps, ENSEMBL_PEP_FA_FILE_ID, ensemblDownloadPath)); - // CDNA - downloadFiles.add(downloadEnsemblDataSource(ensemblProps, ENSEMBL_CDNA_FA_FILE_ID, ensemblDownloadPath)); - - // Save data source (i.e., metadata) - List urls = getUrls(downloadFiles); - // Add manually downloaded files - urls.addAll(getManualUrls(ensemblProps.getUrl())); - saveDataSource(ENSEMBL_DATA, ensemblVersion, getTimeStamp(), urls, - ensemblDownloadPath.resolve(getDataVersionFilename(ENSEMBL_DATA))); - - 
logger.info(CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE, getDataName(ENSEMBL_DATA), getDataCategory(ENSEMBL_DATA)); + + // Check if the species is supported + if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), GENE_DATA)) { + logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataName(ENSEMBL_DATA), getDataCategory(ENSEMBL_DATA)); + DownloadProperties.EnsemblProperties ensemblConfig = configuration.getDownload().getEnsembl(); + + // GTF, DNA, RNA + downloadFiles.add(downloadEnsemblDataSource(ensemblConfig, ENSEMBL_GTF_FILE_ID, ensemblDownloadPath)); + downloadFiles.add(downloadEnsemblDataSource(ensemblConfig, ENSEMBL_PEP_FA_FILE_ID, ensemblDownloadPath)); + downloadFiles.add(downloadEnsemblDataSource(ensemblConfig, ENSEMBL_CDNA_FA_FILE_ID, ensemblDownloadPath)); + + // Save data source (i.e., metadata) + List urls = getUrls(downloadFiles); + + // Add manually downloaded files + urls.addAll(getManualUrls(ensemblConfig.getUrl())); + + saveDataSource(ENSEMBL_DATA, ensemblVersion, getTimeStamp(), urls, + ensemblDownloadPath.resolve(getDataVersionFilename(ENSEMBL_DATA))); + + logger.info(CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE, getDataName(ENSEMBL_DATA), getDataCategory(ENSEMBL_DATA)); + } + return downloadFiles; } private List downloadRefSeq(Path refSeqDownloadPath) throws IOException, InterruptedException, CellBaseException { - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataName(REFSEQ_DATA), getDataCategory(REFSEQ_DATA)); + List downloadFiles = new ArrayList<>(); - List downloadFiles = new ArrayList<>(); - DownloadProperties.URLProperties refSeqProps = configuration.getDownload().getRefSeq(); + // Check if the species is supported + if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), GENE_DATA)) { + logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataName(REFSEQ_DATA), getDataCategory(REFSEQ_DATA)); - // GTF - 
downloadFiles.add(downloadDataSource(refSeqProps, REFSEQ_GENOMIC_GTF_FILE_ID, refSeqDownloadPath)); - // Genomic FASTA - downloadFiles.add(downloadDataSource(refSeqProps, REFSEQ_GENOMIC_FNA_FILE_ID, refSeqDownloadPath)); - // Protein FASTA - downloadFiles.add(downloadDataSource(refSeqProps, REFSEQ_PROTEIN_FAA_FILE_ID, refSeqDownloadPath)); - // cDNA - downloadFiles.add(downloadDataSource(refSeqProps, REFSEQ_RNA_FNA_FILE_ID, refSeqDownloadPath)); + // GTF, DNA, RNA, Protein + String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); + DownloadProperties.URLProperties refSeqConfig = configuration.getDownload().getRefSeq(); + downloadFiles.add(downloadDataSource(refSeqConfig, prefixId + REFSEQ_GENOMIC_GTF_FILE_ID, refSeqDownloadPath)); + downloadFiles.add(downloadDataSource(refSeqConfig, prefixId + REFSEQ_GENOMIC_FNA_FILE_ID, refSeqDownloadPath)); + downloadFiles.add(downloadDataSource(refSeqConfig, prefixId + REFSEQ_RNA_FNA_FILE_ID, refSeqDownloadPath)); + downloadFiles.add(downloadDataSource(refSeqConfig, prefixId + REFSEQ_PROTEIN_FAA_FILE_ID, refSeqDownloadPath)); // Save data source (i.e., metadata) - saveDataSource(REFSEQ_DATA, refSeqProps.getVersion(), getTimeStamp(), getUrls(downloadFiles), + saveDataSource(REFSEQ_DATA, refSeqConfig.getVersion(), getTimeStamp(), getUrls(downloadFiles), refSeqDownloadPath.resolve(getDataVersionFilename(REFSEQ_DATA))); logger.info(CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE, getDataName(REFSEQ_DATA), getDataCategory(REFSEQ_DATA)); - return downloadFiles; } - return Collections.emptyList(); + return downloadFiles; } private DownloadFile downloadMane(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { + DownloadFile downloadFile = null; + + // Check if the species is supported if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(MANE_SELECT_DATA)); - DownloadFile downloadFile = 
downloadAndSaveDataSource(configuration.getDownload().getManeSelect(), MANE_SELECT_FILE_ID, + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getManeSelect(), MANE_SELECT_FILE_ID, MANE_SELECT_DATA, geneDownloadPath); logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(MANE_SELECT_DATA)); - return downloadFile; } - return null; + return downloadFile; } private DownloadFile downloadLrg(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { + DownloadFile downloadFile = null; + + // Check if the species is supported if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(LRG_DATA)); - DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getLrg(), LRG_FILE_ID, LRG_DATA, + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getLrg(), LRG_FILE_ID, LRG_DATA, geneDownloadPath); logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(LRG_DATA)); - return downloadFile; } - return null; + return downloadFile; } private DownloadFile downloadHgnc(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { + DownloadFile downloadFile = null; + + // Check if the species is supported if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(HGNC_DATA)); - DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getHgnc(), HGNC_FILE_ID, HGNC_DATA, + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getHgnc(), HGNC_FILE_ID, HGNC_DATA, geneDownloadPath); logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(HGNC_DATA)); - return downloadFile; } - return null; + return downloadFile; } private DownloadFile downloadCancerHotspot(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { + DownloadFile downloadFile = null; + + // Check if the species is supported if 
(speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CANCER_HOTSPOT_DATA)); - DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getCancerHotspot(), CANCER_HOTSPOT_FILE_ID, + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getCancerHotspot(), CANCER_HOTSPOT_FILE_ID, CANCER_HOTSPOT_DATA, geneDownloadPath); logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(CANCER_HOTSPOT_DATA)); - return downloadFile; } - return null; + return downloadFile; } private DownloadFile downloadDrugData(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { + DownloadFile downloadFile = null; + + // Check if the species is supported if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(DGIDB_DATA)); - DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDgidb(), DGIDB_FILE_ID, DGIDB_DATA, + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDgidb(), DGIDB_FILE_ID, DGIDB_DATA, geneDownloadPath); logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(DGIDB_DATA)); - return downloadFile; } - return null; + return downloadFile; } private DownloadFile downloadGeneUniprotXref(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { + DownloadFile downloadFile = null; + + // Check if the species is supported if (GENE_UNIPROT_XREF_FILES.containsKey(speciesConfiguration.getScientificName())) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(UNIPROT_XREF_DATA)); - DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGeneUniprotXref(), UNIPROT_XREF_FILE_ID, - UNIPROT_XREF_DATA, geneDownloadPath); + String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); + downloadFile = 
downloadAndSaveDataSource(configuration.getDownload().getGeneUniprotXref(), + prefixId + UNIPROT_XREF_FILE_ID, UNIPROT_XREF_DATA, geneDownloadPath); logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(UNIPROT_XREF_DATA)); - return downloadFile; } - return null; + return downloadFile; } private DownloadFile downloadGeneExpressionAtlas(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_EXPRESSION_ATLAS_DATA)); + DownloadFile downloadFile = null; + + // Check if the species is supported + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_EXPRESSION_ATLAS_DATA)); - DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGeneExpressionAtlas(), - GENE_EXPRESSION_ATLAS_FILE_ID, GENE_EXPRESSION_ATLAS_DATA, geneDownloadPath); + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGeneExpressionAtlas(), + GENE_EXPRESSION_ATLAS_FILE_ID, GENE_EXPRESSION_ATLAS_DATA, geneDownloadPath); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_EXPRESSION_ATLAS_DATA)); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_EXPRESSION_ATLAS_DATA)); + } return downloadFile; } private DownloadFile downloadGeneDiseaseAnnotation(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_DISEASE_ANNOTATION_DATA)); + DownloadFile downloadFile = null; + + // Check if the species is supported + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_DISEASE_ANNOTATION_DATA)); - // DisGeNet - DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDisgenet(), DISGENET_FILE_ID, DISGENET_DATA, - geneDownloadPath); + // DisGeNet + downloadFile = 
downloadAndSaveDataSource(configuration.getDownload().getDisgenet(), + DISGENET_FILE_ID, DISGENET_DATA, geneDownloadPath); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_DISEASE_ANNOTATION_DATA)); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_DISEASE_ANNOTATION_DATA)); + } return downloadFile; } private DownloadFile downloadGnomadConstraints(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { + DownloadFile downloadFile = null; + + // Check if the species is supported if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GNOMAD_CONSTRAINTS_DATA)); - DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGnomadConstraints(), + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGnomadConstraints(), GNOMAD_CONSTRAINTS_FILE_ID, GNOMAD_CONSTRAINTS_DATA, geneDownloadPath); - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GNOMAD_CONSTRAINTS_DATA)); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GNOMAD_CONSTRAINTS_DATA)); - return downloadFile; } - return null; + return downloadFile; } private DownloadFile downloadGO(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + DownloadFile downloadFile = null; + + // Check if the species is supported + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME) + || speciesConfiguration.getScientificName().equals("Mus musculus")) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GO_ANNOTATION_DATA)); - DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGoAnnotation(), GO_ANNOTATION_FILE_ID, - GO_ANNOTATION_DATA, geneDownloadPath); + String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGoAnnotation(), + prefixId + GO_ANNOTATION_FILE_ID, GO_ANNOTATION_DATA,
geneDownloadPath); - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GO_ANNOTATION_DATA)); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GO_ANNOTATION_DATA)); - return downloadFile; } - return null; + return downloadFile; } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java deleted file mode 100644 index 9b967eb052..0000000000 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Copyright 2015-2020 OpenCB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ - -package org.opencb.cellbase.lib.download; - -import com.beust.jcommander.ParameterException; -import org.opencb.cellbase.core.config.CellBaseConfiguration; -import org.opencb.cellbase.core.exception.CellBaseException; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -import static org.opencb.cellbase.lib.EtlCommons.*; - -public class GenomeDownloadManager extends AbstractDownloadManager { - - public GenomeDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) - throws IOException, CellBaseException { - super(species, assembly, targetDirectory, configuration); - } - - @Override - public List download() throws IOException, InterruptedException, CellBaseException { - return downloadReferenceGenome(); - } - - public List downloadReferenceGenome() throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENOME_DATA)); - Path sequenceFolder = downloadFolder.resolve(GENOME_DATA); - Files.createDirectories(sequenceFolder); - - // Reference genome sequences are downloaded from Ensembl - // New Homo sapiens assemblies contain too many ALT regions, so we download 'primary_assembly' file instead - DownloadFile downloadFile = downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_PRIMARY_FA_FILE_ID, - sequenceFolder); - - // Save data source - saveDataSource(GENOME_DATA, ensemblVersion, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - sequenceFolder.resolve(getDataVersionFilename(GENOME_DATA))); - - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENOME_DATA)); - - return Collections.singletonList(downloadFile); - } - - /** - * This method downloads bith PhastCons and PhyloP data from UCSC for Human and Mouse species. 
- * @return list of files downloaded - * @throws IOException if there is an error writing to a file - * @throws InterruptedException if there is an error downloading files - * @throws CellBaseException if there is an error executing the command line - */ - public List downloadConservation() throws IOException, InterruptedException, CellBaseException { - if (!speciesHasInfoToDownload(speciesConfiguration, CONSERVATION_DATA)) { - return Collections.emptyList(); - } - List downloadFiles = new ArrayList<>(); - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); - Path conservationFolder = downloadFolder.resolve(CONSERVATION_DATA); - - Files.createDirectories(conservationFolder); - Files.createDirectories(conservationFolder.resolve(GERP_DATA)); - Files.createDirectories(conservationFolder.resolve(PHASTCONS_DATA)); - Files.createDirectories(conservationFolder.resolve(PHYLOP_DATA)); - - String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", - "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "M", }; - - if (assemblyConfiguration.getName().equalsIgnoreCase(GRCH38_NAME)) { - String filename; - Path outputPath; - String assembly = HG38_NAME; - List phastconsUrls = new ArrayList<>(chromosomes.length); - List phyloPUrls = new ArrayList<>(chromosomes.length); - // Downloading PhastCons and PhyloP - logger.info(DOWNLOADING_LOG_MESSAGE, (getDataName(PHASTCONS_DATA) + "/" + getDataName(PHYLOP_DATA))); - for (String chromosome : chromosomes) { - // PhastCons - String phastConsUrl = configuration.getDownload().getPhastCons().getHost() + configuration.getDownload().getPhastCons() - .getFiles().get(PHASTCONS_FILE_ID).replaceAll(PUT_ASSEMBLY_HERE_MARK, assembly) - .replace(PUT_CHROMOSOME_HERE_MARK, chromosome); - filename = Paths.get(phastConsUrl).getFileName().toString(); - outputPath = 
conservationFolder.resolve(PHASTCONS_DATA).resolve(filename); - logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, phastConsUrl, outputPath); - downloadFiles.add(downloadFile(phastConsUrl, outputPath.toString())); - logger.info(OK_LOG_MESSAGE); - phastconsUrls.add(phastConsUrl); - - // PhyloP - String phyloPUrl = configuration.getDownload().getPhylop().getHost() + configuration.getDownload().getPhylop() - .getFiles().get(PHYLOP_FILE_ID).replaceAll(PUT_ASSEMBLY_HERE_MARK, assembly) - .replace(PUT_CHROMOSOME_HERE_MARK, chromosome); - filename = Paths.get(phyloPUrl).getFileName().toString(); - outputPath = conservationFolder.resolve(PHYLOP_DATA).resolve(filename); - logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, phyloPUrl, outputPath); - downloadFiles.add(downloadFile(phyloPUrl, outputPath.toString())); - logger.info(OK_LOG_MESSAGE); - phyloPUrls.add(phyloPUrl); - } - - // Downloading Gerp - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GERP_DATA)); - String gerpUrl = configuration.getDownload().getGerp().getHost() + configuration.getDownload().getGerp().getFiles() - .get(GERP_FILE_ID); - filename = Paths.get(gerpUrl).getFileName().toString(); - outputPath = conservationFolder.resolve(GERP_DATA).resolve(filename); - logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, gerpUrl, outputPath); - downloadFiles.add(downloadFile(gerpUrl, outputPath.toString())); - logger.info(OK_LOG_MESSAGE); - - - // Save data version - saveDataSource(PHASTCONS_DATA, configuration.getDownload().getPhastCons().getVersion(), getTimeStamp(), phastconsUrls, - conservationFolder.resolve(getDataVersionFilename(PHASTCONS_DATA))); - saveDataSource(PHYLOP_DATA, configuration.getDownload().getPhylop().getVersion(), getTimeStamp(), phyloPUrls, - conservationFolder.resolve(getDataVersionFilename(PHYLOP_DATA))); - saveDataSource(GERP_DATA, configuration.getDownload().getGerp().getVersion(), getTimeStamp(), - Collections.singletonList(gerpUrl), conservationFolder.resolve(getDataVersionFilename(GERP_DATA))); - } - 
logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); - } - - return downloadFiles; - } - - public List downloadRepeats() throws IOException, InterruptedException, CellBaseException { - if (!speciesHasInfoToDownload(speciesConfiguration, REPEATS_DATA)) { - return Collections.emptyList(); - } - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(REPEATS_DATA)); - Path repeatsFolder = downloadFolder.resolve(REPEATS_DATA); - Files.createDirectories(repeatsFolder); - List downloadFiles = new ArrayList<>(); - String pathParam; - if (assemblyConfiguration.getName().equalsIgnoreCase(GRCH38_NAME)) { - pathParam = HG38_NAME; - } else { - logger.error("Please provide a valid human assembly: {}, {}", GRCH37_NAME, GRCH38_NAME); - throw new ParameterException("Assembly '" + assemblyConfiguration.getName() + "' is not valid. Please provide " - + "a valid human assembly: " + GRCH37_NAME + ", " + GRCH38_NAME); - } - - // Download tandem repeat finder - String url = configuration.getDownload().getSimpleRepeats().getHost() + configuration.getDownload().getSimpleRepeats() - .getFiles().get(SIMPLE_REPEATS_FILE_ID).replace(PUT_ASSEMBLY_HERE_MARK, pathParam); - Path outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); - downloadFiles.add(downloadFile(url, outputPath.toString())); - logger.info(OK_LOG_MESSAGE); - saveDataSource(TRF_DATA, configuration.getDownload().getSimpleRepeats().getVersion(), getTimeStamp(), - Collections.singletonList(url), repeatsFolder.resolve(getDataVersionFilename(TRF_DATA))); - - // Download genomic super duplications - url = configuration.getDownload().getGenomicSuperDups().getHost() + configuration.getDownload().getGenomicSuperDups() - .getFiles().get(GENOMIC_SUPER_DUPS_FILE_ID).replace(PUT_ASSEMBLY_HERE_MARK, pathParam); - outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); - 
logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); - downloadFiles.add(downloadFile(url, outputPath.toString())); - logger.info(OK_LOG_MESSAGE); - saveDataSource(GSD_DATA, configuration.getDownload().getGenomicSuperDups().getVersion(), getTimeStamp(), - Collections.singletonList(url), repeatsFolder.resolve(getDataVersionFilename(GSD_DATA))); - - // Download WindowMasker - if (!pathParam.equalsIgnoreCase(HG19_NAME)) { - url = configuration.getDownload().getWindowMasker().getHost() + configuration.getDownload().getWindowMasker().getFiles() - .get(WINDOW_MASKER_FILE_ID).replace(PUT_ASSEMBLY_HERE_MARK, pathParam); - outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); - downloadFiles.add(downloadFile(url, outputPath.toString())); - logger.info(OK_LOG_MESSAGE); - saveDataSource(WM_DATA, configuration.getDownload().getWindowMasker().getVersion(), getTimeStamp(), - Collections.singletonList(url), repeatsFolder.resolve(getDataVersionFilename(WM_DATA))); - } - - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(REPEATS_DATA)); - return downloadFiles; - } - return Collections.emptyList(); - } -} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java index b2c102a10e..efb94227d4 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java @@ -18,7 +18,7 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.lib.EtlCommons; +import org.opencb.cellbase.core.utils.SpeciesUtils; import java.io.IOException; import java.nio.file.Files; @@ -37,32 +37,37 @@ public MissenseScoresDownloadManager(String 
species, String assembly, Path targe @Override public List download() throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(MISSENSE_VARIATION_SCORE_DATA)); + DownloadFile downloadFile = null; + if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), MISSENSE_VARIATION_SCORE_DATA)) { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(MISSENSE_VARIATION_SCORE_DATA)); - DownloadFile downloadFile = downloadRevel(); + downloadFile = downloadRevel(); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(MISSENSE_VARIATION_SCORE_DATA)); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(MISSENSE_VARIATION_SCORE_DATA)); + } return Collections.singletonList(downloadFile); } public DownloadFile downloadRevel() throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(REVEL_DATA)); - if (!speciesConfiguration.getScientificName().equals(EtlCommons.HOMO_SAPIENS_NAME)) { - logger.info("{}/{} not supported for species {}", getDataCategory(REVEL_DATA), getDataName(REVEL_DATA), - speciesConfiguration.getScientificName()); - return null; - } + DownloadFile downloadFile = null; + + String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); - // Create the REVEL download path - Path revelDownloadPath = downloadFolder.resolve(MISSENSE_VARIATION_SCORE_DATA).resolve(REVEL_DATA); - Files.createDirectories(revelDownloadPath); + // Check if the species is supported + if (configuration.getDownload().getRevel().getFiles().containsKey(prefixId + REVEL_FILE_ID)) { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(REVEL_DATA)); - // Download REVEL and save data source - DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getRevel(), REVEL_FILE_ID, REVEL_DATA, - revelDownloadPath); + // Create the REVEL download path + Path revelDownloadPath = 
downloadFolder.resolve(MISSENSE_VARIATION_SCORE_DATA).resolve(REVEL_DATA); + Files.createDirectories(revelDownloadPath); - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(REVEL_DATA)); + // Download REVEL and save data source + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getRevel(), prefixId + REVEL_FILE_ID, REVEL_DATA, + revelDownloadPath); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(REVEL_DATA)); + } return downloadFile; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java index 53ff518323..c3048c554c 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java @@ -18,6 +18,7 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.utils.SpeciesUtils; import org.opencb.commons.utils.FileUtils; import java.io.BufferedReader; @@ -30,6 +31,7 @@ import static org.opencb.cellbase.lib.EtlCommons.*; + public class OntologyDownloadManager extends AbstractDownloadManager { private static final String DATA_VERSION_FIELD = "data-version:"; @@ -40,51 +42,57 @@ public OntologyDownloadManager(String species, String assembly, Path targetDirec } public List download() throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(ONTOLOGY_DATA)); - - Path oboFolder = downloadFolder.resolve(ONTOLOGY_DATA); - Files.createDirectories(oboFolder); - - DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); - // HPO - downloadFile = downloadDataSource(configuration.getDownload().getHpoObo(), HPO_OBO_FILE_ID, oboFolder); - String version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); -
saveDataSource(HPO_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - oboFolder.resolve(getDataVersionFilename(HPO_OBO_DATA))); - downloadFiles.add(downloadFile); - - // GO - downloadFile = downloadDataSource(configuration.getDownload().getGoObo(), GO_OBO_FILE_ID, oboFolder); - version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); - saveDataSource(GO_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - oboFolder.resolve(getDataVersionFilename(GO_OBO_DATA))); - downloadFiles.add(downloadFile); - - // DOID - downloadFile = downloadDataSource(configuration.getDownload().getDoidObo(), DOID_OBO_FILE_ID, oboFolder); - version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); - saveDataSource(DOID_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - oboFolder.resolve(getDataVersionFilename(DOID_OBO_DATA))); - downloadFiles.add(downloadFile); - - // Mondo - downloadFile = downloadDataSource(configuration.getDownload().getMondoObo(), MONDO_OBO_FILE_ID, oboFolder); - version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); - saveDataSource(MONDO_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - oboFolder.resolve(getDataVersionFilename(MONDO_OBO_DATA))); - downloadFiles.add(downloadFile); - - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(ONTOLOGY_DATA)); + // Check if the species has the data to download + if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), ONTOLOGY_DATA)) { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(ONTOLOGY_DATA)); + + Path oboFolder = downloadFolder.resolve(ONTOLOGY_DATA); + Files.createDirectories(oboFolder); + + DownloadFile downloadFile; + + // HPO + downloadFile = downloadDataSource(configuration.getDownload().getHpoObo(), HPO_OBO_FILE_ID, oboFolder); + String version = 
getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); + saveDataSource(HPO_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(getDataVersionFilename(HPO_OBO_DATA))); + downloadFiles.add(downloadFile); + + // GO + downloadFile = downloadDataSource(configuration.getDownload().getGoObo(), GO_OBO_FILE_ID, oboFolder); + version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); + saveDataSource(GO_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(getDataVersionFilename(GO_OBO_DATA))); + downloadFiles.add(downloadFile); + + // DOID + downloadFile = downloadDataSource(configuration.getDownload().getDoidObo(), DOID_OBO_FILE_ID, oboFolder); + version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); + saveDataSource(DOID_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(getDataVersionFilename(DOID_OBO_DATA))); + downloadFiles.add(downloadFile); + + // Mondo + downloadFile = downloadDataSource(configuration.getDownload().getMondoObo(), MONDO_OBO_FILE_ID, oboFolder); + version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); + saveDataSource(MONDO_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(getDataVersionFilename(MONDO_OBO_DATA))); + downloadFiles.add(downloadFile); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(ONTOLOGY_DATA)); + } + return downloadFiles; } private String getVersionFromOboFile(Path oboPath) throws CellBaseException, IOException { - String version = null; if (!oboPath.toFile().exists()) { throw new CellBaseException("OBO file " + oboPath + " does not exit"); } + + String version = null; try (BufferedReader reader = FileUtils.newBufferedReader(oboPath)) { String line; while ((line = reader.readLine()) != null) { diff --git 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java index 25ad390650..649c580493 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java @@ -19,13 +19,13 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.utils.SpeciesUtils; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; import java.util.List; -import java.util.Map; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -38,32 +38,36 @@ public PharmGKBDownloadManager(String species, String assembly, Path targetDirec @Override public List download() throws IOException, InterruptedException, CellBaseException { - logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataCategory(PHARMGKB_DATA), getDataName(PHARMGKB_DATA)); + List downloadFiles = new ArrayList<>(); - Path pharmgkbDownloadFolder = downloadFolder.resolve(PHARMACOGENOMICS_DATA).resolve(PHARMGKB_DATA); - Files.createDirectories(pharmgkbDownloadFolder); + // Check if the species has the data to download + if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), PHARMGKB_DATA)) { + logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataCategory(PHARMGKB_DATA), getDataName(PHARMGKB_DATA)); - DownloadProperties.URLProperties pharmGKBProps = configuration.getDownload().getPharmGKB(); + Path pharmgkbDownloadFolder = downloadFolder.resolve(PHARMACOGENOMICS_DATA).resolve(PHARMGKB_DATA); + Files.createDirectories(pharmgkbDownloadFolder); - List urls = new ArrayList<>(); - List downloadFiles = new ArrayList<>(); - String host = pharmGKBProps.getHost(); - 
for (Map.Entry entry : pharmGKBProps.getFiles().entrySet()) { - String url = host + entry.getValue(); - urls.add(url); + DownloadProperties.URLProperties pharmGKBConfig = configuration.getDownload().getPharmGKB(); - Path downloadedFilePath = pharmgkbDownloadFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, downloadedFilePath); - DownloadFile downloadFile = downloadFile(url, downloadedFilePath.toString()); - logger.info(OK_LOG_MESSAGE); - downloadFiles.add(downloadFile); - } + List urls = new ArrayList<>(); + for (String fileName : pharmGKBConfig.getFiles().values()) { + String url = pharmGKBConfig.getHost() + fileName; + urls.add(url); - // Save data source - saveDataSource(PHARMGKB_DATA, pharmGKBProps.getVersion(), getTimeStamp(), urls, - pharmgkbDownloadFolder.resolve(getDataVersionFilename(PHARMGKB_DATA))); + Path downloadedFilePath = pharmgkbDownloadFolder.resolve(getFilenameFromUrl(url)); + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, downloadedFilePath); + DownloadFile downloadFile = downloadFile(url, downloadedFilePath.toString()); + logger.info(OK_LOG_MESSAGE); + downloadFiles.add(downloadFile); + } + + // Save data source + saveDataSource(PHARMGKB_DATA, pharmGKBConfig.getVersion(), getTimeStamp(), urls, + pharmgkbDownloadFolder.resolve(getDataVersionFilename(PHARMGKB_DATA))); + + logger.info(CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE, getDataCategory(PHARMGKB_DATA), getDataName(PHARMGKB_DATA)); + } - logger.info(CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE, getDataCategory(PHARMGKB_DATA), getDataName(PHARMGKB_DATA)); return downloadFiles; } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java index ba75a8e162..d32d3100be 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java @@ -18,12 +18,12 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.utils.SpeciesUtils; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; -import java.util.Collections; import java.util.List; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -44,30 +44,34 @@ public ProteinDownloadManager(String species, String assembly, Path targetDirect * @throws CellBaseException if there is an error in the CelllBase configuration file */ public List download() throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(PROTEIN_DATA)); - if (!speciesHasInfoToDownload(speciesConfiguration, PROTEIN_DATA)) { - logger.info("{} not supported for the species {}", getDataName(PROTEIN_DATA), speciesConfiguration.getScientificName()); - return Collections.emptyList(); - } - Path proteinFolder = downloadFolder.resolve(PROTEIN_DATA); - Files.createDirectories(proteinFolder); - - DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); - // Uniprot - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getUniprot(), UNIPROT_FILE_ID, UNIPROT_DATA, proteinFolder); - downloadFiles.add(downloadFile); + // Check if the species is supported + if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), PROTEIN_DATA)) { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(PROTEIN_DATA)); + Path proteinFolder = downloadFolder.resolve(PROTEIN_DATA); + Files.createDirectories(proteinFolder); + + DownloadFile downloadFile; - // InterPro - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getInterpro(), INTERPRO_FILE_ID, INTERPRO_DATA, proteinFolder); - downloadFiles.add(downloadFile); + // Uniprot + downloadFile = 
downloadAndSaveDataSource(configuration.getDownload().getUniprot(), + UNIPROT_FILE_ID, UNIPROT_DATA, proteinFolder); + downloadFiles.add(downloadFile); - // Intact - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getIntact(), INTACT_FILE_ID, INTACT_DATA, proteinFolder); - downloadFiles.add(downloadFile); + // InterPro + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getInterpro(), + INTERPRO_FILE_ID, INTERPRO_DATA, proteinFolder); + downloadFiles.add(downloadFile); + + // Intact + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getIntact(), + INTACT_FILE_ID, INTACT_DATA, proteinFolder); + downloadFiles.add(downloadFile); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(PROTEIN_DATA)); + } - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(PROTEIN_DATA)); return downloadFiles; } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java index 0c87775f5c..a08fd8c600 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java @@ -18,12 +18,12 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.utils.SpeciesUtils; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; -import java.util.Collections; import java.util.List; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -40,21 +40,21 @@ public RegulationDownloadManager(String species, String assembly, Path outdir, C @Override public List download() throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(REGULATION_DATA)); - if 
(!speciesHasInfoToDownload(speciesConfiguration, REGULATION_DATA)) { - logger.info("{} not supported for the species {}", getDataName(REGULATION_DATA), speciesConfiguration.getScientificName()); - return Collections.emptyList(); - } - regulationFolder = downloadFolder.resolve(REGULATION_DATA); - Files.createDirectories(regulationFolder); - List downloadFiles = new ArrayList<>(); - downloadFiles.addAll(downloadRegulatoryaAndMotifFeatures()); - downloadFiles.add(downloadMiRTarBase()); - downloadFiles.add(downloadMirna()); + // Check if species is supported + if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), REGULATION_DATA)) { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(REGULATION_DATA)); + regulationFolder = downloadFolder.resolve(REGULATION_DATA); + Files.createDirectories(regulationFolder); + + downloadFiles.addAll(downloadRegulatoryaAndMotifFeatures()); + downloadFiles.add(downloadMiRTarBase()); + downloadFiles.add(downloadMirna()); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(REGULATION_DATA)); + } - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(REGULATION_DATA)); return downloadFiles; } @@ -78,11 +78,13 @@ private List downloadRegulatoryaAndMotifFeatures() throws IOExcept regulationFolder); downloadFiles.add(downloadFile); urls.add(downloadFile.getUrl()); + // And now the index file downloadFile = downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_MOTIF_FEATURES_INDEX_FILE_ID, null, regulationFolder); downloadFiles.add(downloadFile); urls.add(downloadFile.getUrl()); + // Save data source (name, category, version,...) 
saveDataSource(MOTIF_FEATURES_DATA, "(" + getDataName(ENSEMBL_DATA) + " " + ensemblVersion + ")", getTimeStamp(), urls, regulationFolder.resolve(getDataVersionFilename(MOTIF_FEATURES_DATA))); @@ -101,12 +103,16 @@ private DownloadFile downloadMirna() throws IOException, InterruptedException, C } private DownloadFile downloadMiRTarBase() throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(MIRTARBASE_DATA)); + DownloadFile downloadFile = null; + String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); + if (configuration.getDownload().getMiRTarBase().getFiles().containsKey(prefixId + MIRTARBASE_FILE_ID)) { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(MIRTARBASE_DATA)); - DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getMiRTarBase(), MIRTARBASE_FILE_ID, - MIRTARBASE_DATA, regulationFolder); + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getMiRTarBase(), + prefixId + MIRTARBASE_FILE_ID, MIRTARBASE_DATA, regulationFolder); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(MIRTARBASE_DATA)); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(MIRTARBASE_DATA)); + } return downloadFile; } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/SpliceScoreDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/SpliceScoreDownloadManager.java index 7c0f1c0c94..20bacf80be 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/SpliceScoreDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/SpliceScoreDownloadManager.java @@ -37,13 +37,15 @@ public SpliceScoreDownloadManager(String species, String assembly, Path outdir, @Override public List download() throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(SPLICE_SCORE_DATA)); + // Check if the species is 
supported if (!speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info("{} not supported for the species {}", getDataName(SPLICE_SCORE_DATA), speciesConfiguration.getScientificName()); return Collections.emptyList(); } + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(SPLICE_SCORE_DATA)); + // Create splice score directory Path spliceScorePath = downloadFolder.resolve(SPLICE_SCORE_DATA).toAbsolutePath(); Files.createDirectories(spliceScorePath); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/install/InstallManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/install/InstallManager.java deleted file mode 100644 index d6192b3059..0000000000 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/install/InstallManager.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright 2015-2020 OpenCB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.opencb.cellbase.lib.install; - -import org.opencb.cellbase.core.common.Species; -import org.opencb.cellbase.core.config.CellBaseConfiguration; -import org.opencb.cellbase.core.config.SpeciesConfiguration; -import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.core.utils.SpeciesUtils; -import org.opencb.cellbase.lib.db.MongoDBManager; -import org.opencb.commons.datastore.mongodb.MongoDataStore; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.List; - -public class InstallManager { - - private CellBaseConfiguration configuration; - private Logger logger; - - public InstallManager(CellBaseConfiguration configuration) { - this.configuration = configuration; - - logger = LoggerFactory.getLogger(this.getClass()); - } - - /** - * Add shard indexes and ranges in Mongo based on config file entries. - * - * @param speciesName name of species - * @param assemblyName name of assembly - * @throws CellBaseException if invalid input - */ - public void install(String speciesName, String assemblyName) throws CellBaseException { - // TDDO check database credentials - - // user API perms - - // check repl sets - - Species species = SpeciesUtils.getSpecies(configuration, speciesName, assemblyName); - - SpeciesConfiguration speciesConfiguration = configuration.getSpeciesConfig(species.getId()); - if (speciesConfiguration == null) { - LoggerFactory.getLogger(MongoDBShardUtils.class).warn("No config found for '" + species.getId() + "'"); - return; - } - - List shards = speciesConfiguration.getShards(); - if (shards != null) { - // if sharding in config - shard(species); - } - } - - private void shard(Species species) throws CellBaseException { - MongoDBManager mongoDBManager = new MongoDBManager(configuration); - MongoDataStore mongoDBDatastore = mongoDBManager.createMongoDBDatastore(species.getId(), species.getAssembly()); - MongoDBShardUtils.shard(mongoDBDatastore, configuration, species); - } -} diff 
--git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/install/MongoDBShardUtils.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/install/MongoDBShardUtils.java deleted file mode 100644 index bb96933be5..0000000000 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/install/MongoDBShardUtils.java +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright 2015-2020 OpenCB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.opencb.cellbase.lib.install; - -import com.mongodb.client.MongoClient; -import com.mongodb.client.MongoDatabase; -import org.apache.commons.lang.StringUtils; -import org.bson.Document; -import org.opencb.cellbase.core.common.Species; -import org.opencb.cellbase.core.config.CellBaseConfiguration; -import org.opencb.cellbase.core.config.MongoDBDatabaseCredentials; -import org.opencb.cellbase.core.config.SpeciesConfiguration; -import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.commons.datastore.core.ObjectMap; -import org.opencb.commons.datastore.mongodb.MongoDataStore; -import org.slf4j.LoggerFactory; - -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -public class MongoDBShardUtils { - - /** - * Add shards. - * - * @param mongoDataStore Database name - * @param cellBaseConfiguration config file with database details. 
- * @param species the species name and assembly for the database being sharded - * @throws CellBaseException if configuration isn't valid - */ - public static void shard(MongoDataStore mongoDataStore, CellBaseConfiguration cellBaseConfiguration, Species species) - throws CellBaseException { - SpeciesConfiguration speciesConfiguration = cellBaseConfiguration.getSpeciesConfig(species.getId()); - if (speciesConfiguration == null) { - LoggerFactory.getLogger(MongoDBShardUtils.class).warn("No config found for '" + species.getId() + "'"); - return; - } - - List shards = speciesConfiguration.getShards(); - if (shards == null) { - LoggerFactory.getLogger(MongoDBShardUtils.class).error("No sharding config found for '" + species.getId() + "'"); - return; - } - - for (SpeciesConfiguration.ShardConfig shardConfig : shards) { - // create the collection, if it's there already do nothing - String collectionName = createCollection(mongoDataStore, shardConfig); - - // set the keymap, e.g. chromosome, start, end. Also can be a single key - Map keyMap = createKeyMap(shardConfig); - - // shard keys must be indexed FIRST - createIndex(mongoDataStore, keyMap, collectionName); - - String databaseName = mongoDataStore.getDatabaseName(); - String fullCollectionName = mongoDataStore.getDatabaseName() + "." 
+ collectionName; - MongoClient mongoClient = mongoDataStore.getMongoClient(); - MongoDatabase adminDB = mongoClient - .getDatabase(cellBaseConfiguration.getDatabases().getMongodb().getOptions().get("authenticationDatabase")); - - // sh.enableSharding( "cellbase_hsapiens_grch37_v4" ) -// adminDB.runCommand(new Document("enableSharding", databaseName)); - - // sh.shardCollection("cellbase_hsapiens_grch37_v4.variation", { "chromosome": 1, "start": 1, "end": 1 } ) - adminDB.runCommand(new Document("shardcollection", fullCollectionName).append("key", new Document(keyMap))); - - MongoDBDatabaseCredentials databaseCredentials = cellBaseConfiguration.getDatabases().getMongodb(); - List replicaSets = databaseCredentials.getShards(); - - if (replicaSets == null || replicaSets.isEmpty()) { - LoggerFactory.getLogger(MongoDBShardUtils.class).warn("No replicaset config found for '" + species.getId() + "'"); - return; - } - - // different from our shard key, this is the key used for the zones ONLY - final String rangeKey = shardConfig.getRangeKey(); - - int i = 0; - for (SpeciesConfiguration.Zone zone : shardConfig.getZones()) { - MongoDBDatabaseCredentials.ReplicaSet replicaSet = replicaSets.get(i++); - - // sh.addShard( "rs0/cb-mongo-shard1-1:27017,cb-mongo-shard1-2:27017,cb-mongo-shard1-3:27017" ) -// String replicaSetName = replicaSet.getId() + "/" + replicaSet.getNodes(); -// adminDB.runCommand(new Document("addShard", replicaSetName)); - - // sh.addShardToZone("rs0", "zone0") - adminDB.runCommand(new Document("addShardToZone", replicaSet.getId()).append("zone", zone.getName())); - - // put chromosome 1 in shard0 - //sh.addTagRange("cellbase_hsapiens_grch37_v4.variation", { "chromosome" : "1" }, { "chromosome" : "10" }, "zone0" ) - List shardRanges = zone.getShardRanges(); - for (SpeciesConfiguration.ShardRange shardRange : shardRanges) { - adminDB.runCommand(new Document("updateZoneKeyRange", fullCollectionName) - .append("min", new Document(rangeKey, 
shardRange.getMinimum())) - .append("max", new Document(rangeKey, shardRange.getMaximum())) - .append("zone", zone.getName())); - } - } - } - } - - private static String createCollection(MongoDataStore mongoDataStore, SpeciesConfiguration.ShardConfig shardConfig) - throws CellBaseException { - String collectionName = shardConfig.getCollection(); - if (StringUtils.isEmpty(collectionName)) { - throw new CellBaseException("Sharding failed: collection name not found in config"); - } - if (mongoDataStore.getCollection(collectionName) == null) { - mongoDataStore.createCollection(collectionName); - } - return collectionName; - } - - private static void createIndex(MongoDataStore mongoDataStore, Map keyMap, String collectionName) { - HashMap options = new HashMap<>(); - options.put("background", "true"); - Map indexes = new HashMap<>(); - indexes.put("fields", new ObjectMap((Map) keyMap)); - indexes.put("options", new ObjectMap((Map) options)); - // FIXME We need to correctly call to MongoDBIndexUtils -// MongoDBIndexUtils mongoDBIndexUtils = new MongoDBIndexUtils(mongoDataStore, null); -// MongoDBIndexUtils.createIndexes(mongoDataStore, Collections.singletonList(indexes), false); - } - - private static Map createKeyMap(SpeciesConfiguration.ShardConfig shardConfig) { - List keys = shardConfig.getKey(); - Map keyMap = new HashMap<>(); - for (String key : keys) { - keyMap.put(key, 1); - } - return keyMap; - } -} diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderTest.java index b15925e7a3..629d491542 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderTest.java @@ -2,6 +2,7 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.config.SpeciesConfiguration; +import 
org.opencb.cellbase.core.utils.SpeciesUtils; import java.nio.file.Path; import java.nio.file.Paths; @@ -13,7 +14,7 @@ public void testGeneBuilder() throws Exception { Path buildPath = Paths.get("/home/jtarraga/data/cellbase/cb6/v6.1.0-dr1/homo_sapiens_grch38/generated_json/gene"); boolean flexibleGTFParsing = false; CellBaseConfiguration configuration = CellBaseConfiguration.load(Paths.get("/home/jtarraga/appl/cellbase/build/conf/configuration.yml")); - SpeciesConfiguration speciesConfiguration = configuration.getSpeciesConfig("hsapiens"); + SpeciesConfiguration speciesConfiguration = SpeciesUtils.getSpeciesConfiguration(configuration, "hsapiens"); GeneBuilder geneBuilder = new GeneBuilder(downloadPath, buildPath, speciesConfiguration, flexibleGTFParsing, configuration); geneBuilder.check(); diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/GeneBuilderTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/GeneBuilderTest.java index 798c1a29db..83af84b232 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/GeneBuilderTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/GeneBuilderTest.java @@ -17,29 +17,19 @@ package org.opencb.cellbase.lib.builders; -import com.fasterxml.jackson.annotation.JsonInclude; -import com.fasterxml.jackson.databind.MapperFeature; import com.fasterxml.jackson.databind.ObjectMapper; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.TestInstance; -import org.opencb.biodata.formats.feature.gff.Gff2; -import org.opencb.biodata.formats.feature.gtf.Gtf; import org.opencb.biodata.models.core.*; import org.opencb.cellbase.core.config.SpeciesConfiguration; -import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; -import org.opencb.cellbase.core.serializer.CellBaseSerializer; import 
org.opencb.commons.utils.FileUtils; import java.io.BufferedReader; import java.io.IOException; -import java.net.URISyntaxException; -import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; import java.util.Set; @@ -49,7 +39,7 @@ public class GeneBuilderTest { private GeneBuilder geneParser; private ObjectMapper jsonObjectMapper; - private static final SpeciesConfiguration SPECIES = new SpeciesConfiguration("hsapiens", "Homo sapiens", "human", null, null, null); + private static final SpeciesConfiguration SPECIES = new SpeciesConfiguration("hsapiens", "Homo sapiens", "human", null, null); public GeneBuilderTest() { } diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderTest.java index 9ab36de70a..3507f69cca 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderTest.java @@ -59,8 +59,7 @@ public void init() throws Exception { // put the results in /tmp CellBaseSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), "refseq", true); - SpeciesConfiguration species = new SpeciesConfiguration("hsapiens", "Homo sapiens", - "human", null, null, null); + SpeciesConfiguration species = new SpeciesConfiguration("hsapiens", "Homo sapiens", "human", null, null); CellBaseConfiguration configuration = CellBaseConfiguration.load(configurationPath); geneParser = new RefSeqGeneBuilder(geneDirectoryPath, species, configuration, serializer); geneParser.parse(); From aaec0657213deeb41c1ae8e2f4ae9b433b343be1 Mon Sep 17 00:00:00 2001 From: imedina Date: Tue, 2 Jul 2024 03:40:04 +0100 Subject: [PATCH 079/148] * Add new ensembl_canonical.pl * Split GenomeDownloader in Conservation and Repeats --- .../ensembl-scripts/ensembl-canonical.pl | 45 
+++++++++ .../ensembl-scripts/martURLLocation.xml | 19 ++++ .../lib/download/GenomeDownloadManager.java | 81 ++++++++++++++++ .../lib/download/RepeatsDownloadManager.java | 97 +++++++++++++++++++ 4 files changed, 242 insertions(+) create mode 100755 cellbase-app/app/scripts/ensembl-scripts/ensembl-canonical.pl create mode 100644 cellbase-app/app/scripts/ensembl-scripts/martURLLocation.xml create mode 100644 cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java create mode 100644 cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RepeatsDownloadManager.java diff --git a/cellbase-app/app/scripts/ensembl-scripts/ensembl-canonical.pl b/cellbase-app/app/scripts/ensembl-scripts/ensembl-canonical.pl new file mode 100755 index 0000000000..5066ccd3ad --- /dev/null +++ b/cellbase-app/app/scripts/ensembl-scripts/ensembl-canonical.pl @@ -0,0 +1,45 @@ +#!/usr/bin/env perl + +# An example script demonstrating the use of BioMart API. +# This perl API representation is only available for configuration versions >= 0.5 +use strict; +use BioMart::Initializer; +use BioMart::Query; +use BioMart::QueryRunner; + +## Default values +my $species = 'hsapiens'; +my $outdir = "./"; + +## Parsing command line +GetOptions ('species=s' => \$species, 'outdir=s' => \$outdir); + + +my $confFile = "/opt/cellbase/scripts/ensembl-scripts/martURLLocation.xml"; + +# NB: change action to 'clean' if you wish to start a fresh configuration +# and to 'cached' if you want to skip configuration step on subsequent runs from the same registry +my $action='clean'; +my $initializer = BioMart::Initializer->new('registryFile'=>$confFile, 'action'=>$action); +my $registry = $initializer->getRegistry; + +my $query = BioMart::Query->new('registry'=>$registry,'virtualSchemaName'=>'default'); + +$query->setDataset($species."_gene_ensembl"); + +$query->addAttribute("ensembl_gene_id"); +$query->addAttribute("ensembl_transcript_id"); 
+$query->addAttribute("transcript_is_canonical"); + +$query->formatter("TSV"); + +open (ENSEMBL_CANONICAL, ">$outdir/ensembl_canonical.txt") || die "Cannot open ensembl_canonical.txt file"; + +my $query_runner = BioMart::QueryRunner->new(); + +# to obtain unique rows only +$query_runner->uniqueRowsOnly(1); +$query_runner->execute($query); +#$query_runner->printHeader(); +print ENSEMBL_CANONICAL $query_runner->printResults(); +#$query_runner->printFooter(); diff --git a/cellbase-app/app/scripts/ensembl-scripts/martURLLocation.xml b/cellbase-app/app/scripts/ensembl-scripts/martURLLocation.xml new file mode 100644 index 0000000000..a710368f8f --- /dev/null +++ b/cellbase-app/app/scripts/ensembl-scripts/martURLLocation.xml @@ -0,0 +1,19 @@ + + + + + \ No newline at end of file diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java new file mode 100644 index 0000000000..4aee44f533 --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -0,0 +1,81 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.opencb.cellbase.lib.download; + +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.commons.exec.Command; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Collections; +import java.util.List; + +import static org.opencb.cellbase.lib.EtlCommons.*; + +public class GenomeDownloadManager extends AbstractDownloadManager { + + public GenomeDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) + throws IOException, CellBaseException { + super(species, assembly, targetDirectory, configuration); + } + + @Override + public List download() throws IOException, InterruptedException, CellBaseException { + downloadGenomeInfo(); + return downloadReferenceGenome(); + } + + public List downloadReferenceGenome() throws IOException, InterruptedException, CellBaseException { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENOME_DATA)); + Path sequenceFolder = downloadFolder.resolve(GENOME_DATA); + Files.createDirectories(sequenceFolder); + + // Reference genome sequences are downloaded from Ensembl + // New Homo sapiens assemblies contain too many ALT regions, so we download 'primary_assembly' file instead + DownloadFile downloadFile = downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_PRIMARY_FA_FILE_ID, + sequenceFolder); + + // Save data source + saveDataSource(GENOME_DATA, ensemblVersion, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + sequenceFolder.resolve(getDataVersionFilename(GENOME_DATA))); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENOME_DATA)); + + return Collections.singletonList(downloadFile); + } + + public void downloadGenomeInfo() throws IOException, InterruptedException, CellBaseException { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENOME_INFO_DATA)); + Path sequenceFolder = 
downloadFolder.resolve(GENOME_DATA); + Files.createDirectories(sequenceFolder); + + String s = "docker run --mount type=bind,source=\"" + sequenceFolder.toAbsolutePath() + "\",target=\"/tmp\" " + + "opencb/cellbase-builder:6.2.0-SNAPSHOT /opt/cellbase/scripts/ensembl-scripts/genome_info.pl " + + "--species \"Homo sapiens\" --outfile \"/tmp/genome_info.json\""; + logger.info(s); + logger.info(sequenceFolder.toAbsolutePath().toString()); + Command command = new Command(s); + command.run(); + + // FIXME Joaquin please use DockerUtils. +// DockerUtils.run("opencb/cellbase-builder:6.2.0-SNAPSHOT", sequenceFolder.toAbsolutePath(), "/tmp" ) + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENOME_INFO_DATA)); + } + +} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RepeatsDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RepeatsDownloadManager.java new file mode 100644 index 0000000000..0122893833 --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RepeatsDownloadManager.java @@ -0,0 +1,97 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.opencb.cellbase.lib.download; + +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.utils.SpeciesUtils; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import static org.opencb.cellbase.lib.EtlCommons.*; + +public class RepeatsDownloadManager extends AbstractDownloadManager { + + public RepeatsDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) + throws IOException, CellBaseException { + super(species, assembly, targetDirectory, configuration); + } + + @Override + public List download() throws IOException, InterruptedException, CellBaseException { + return downloadRepeats(); + } + + public List downloadRepeats() throws IOException, InterruptedException, CellBaseException { + List downloadFiles = new ArrayList<>(); + + // Check if species is supported + if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), REPEATS_DATA)) { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(REPEATS_DATA)); + + Path repeatsFolder = downloadFolder.resolve(REPEATS_DATA); + Files.createDirectories(repeatsFolder); + + String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); + + // Download tandem repeat finder + if (configuration.getDownload().getSimpleRepeats().getFiles().containsKey(prefixId + SIMPLE_REPEATS_FILE_ID)) { + String url = configuration.getDownload().getSimpleRepeats().getHost() + + configuration.getDownload().getSimpleRepeats().getFiles().get(prefixId + SIMPLE_REPEATS_FILE_ID); + Path outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); + downloadFiles.add(downloadFile(url, outputPath.toString())); + logger.info(OK_LOG_MESSAGE); + 
saveDataSource(TRF_DATA, configuration.getDownload().getSimpleRepeats().getVersion(), getTimeStamp(), + Collections.singletonList(url), repeatsFolder.resolve(getDataVersionFilename(TRF_DATA))); + } + + // Download WindowMasker + if (configuration.getDownload().getWindowMasker().getFiles().containsKey(prefixId + WINDOW_MASKER_FILE_ID)) { + String url = configuration.getDownload().getWindowMasker().getHost() + + configuration.getDownload().getWindowMasker().getFiles().get(prefixId + WINDOW_MASKER_FILE_ID); + Path outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); + downloadFiles.add(downloadFile(url, outputPath.toString())); + logger.info(OK_LOG_MESSAGE); + saveDataSource(WM_DATA, configuration.getDownload().getWindowMasker().getVersion(), getTimeStamp(), + Collections.singletonList(url), repeatsFolder.resolve(getDataVersionFilename(WM_DATA))); + } + + // Download genomic super duplications + if (configuration.getDownload().getGenomicSuperDups().getFiles().containsKey(prefixId + GENOMIC_SUPER_DUPS_FILE_ID)) { + String url = configuration.getDownload().getGenomicSuperDups().getHost() + + configuration.getDownload().getGenomicSuperDups().getFiles().get(prefixId + GENOMIC_SUPER_DUPS_FILE_ID); + Path outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); + downloadFiles.add(downloadFile(url, outputPath.toString())); + logger.info(OK_LOG_MESSAGE); + saveDataSource(GSD_DATA, configuration.getDownload().getGenomicSuperDups().getVersion(), getTimeStamp(), + Collections.singletonList(url), repeatsFolder.resolve(getDataVersionFilename(GSD_DATA))); + } + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(REPEATS_DATA)); + } + + return downloadFiles; + } +} From 694b81d9af3bc7810847059bfc02c512a7fb8a9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 2 Jul 2024 20:49:22 +0200 
Subject: [PATCH 080/148] lib: use DockerUtils to execute Perl script from docker image, #TASK-5775, #TASK-5564 On branch TASK-5564 Changes to be committed: modified: cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java --- .../lib/download/GenomeDownloadManager.java | 32 ++++++++++++------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index 4aee44f533..ee67078f84 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -16,13 +16,15 @@ package org.opencb.cellbase.lib.download; +import org.opencb.cellbase.core.common.GitRepositoryState; import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.commons.exec.Command; +import org.opencb.commons.utils.DockerUtils; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.util.AbstractMap; import java.util.Collections; import java.util.List; @@ -60,21 +62,29 @@ public List downloadReferenceGenome() throws IOException, Interrup return Collections.singletonList(downloadFile); } - public void downloadGenomeInfo() throws IOException, InterruptedException, CellBaseException { + public void downloadGenomeInfo() throws IOException, CellBaseException { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENOME_INFO_DATA)); Path sequenceFolder = downloadFolder.resolve(GENOME_DATA); Files.createDirectories(sequenceFolder); - String s = "docker run --mount type=bind,source=\"" + sequenceFolder.toAbsolutePath() + "\",target=\"/tmp\" " - + "opencb/cellbase-builder:6.2.0-SNAPSHOT /opt/cellbase/scripts/ensembl-scripts/genome_info.pl " - + "--species \"Homo 
sapiens\" --outfile \"/tmp/genome_info.json\""; - logger.info(s); - logger.info(sequenceFolder.toAbsolutePath().toString()); - Command command = new Command(s); - command.run(); + String dockerImage = "opencb/cellbase-builder:" + GitRepositoryState.get().getBuildVersion(); + try { + // Build command line to run Perl script via docker image + // Output binding + AbstractMap.SimpleEntry outputBinding = new AbstractMap.SimpleEntry<>( + sequenceFolder.toAbsolutePath().toString(), "/tmp"); + + // Params + String params = "/opt/cellbase/scripts/ensembl-scripts/genome_info.pl" + + " --species \"Homo sapiens\"" + + " --outfile \"" + outputBinding.getValue() + "/genome_info.json\""; + + // Execute perl script in docker + DockerUtils.run(dockerImage, null, outputBinding, params, null); + } catch (Exception e) { + throw new CellBaseException("Error executing Perl script from Docker " + dockerImage, e); + } - // FIXME Joaquin please use DockerUtils. -// DockerUtils.run("opencb/cellbase-builder:6.2.0-SNAPSHOT", sequenceFolder.toAbsolutePath(), "/tmp" ) logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENOME_INFO_DATA)); } From c8e719aa887c286127bfc83fdf7f12fcd32433f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 3 Jul 2024 13:19:54 +0200 Subject: [PATCH 081/148] test: update JUnit tests, #TASK-5564 On branch TASK-5564 Changes to be committed: modified: cellbase-core/src/test/java/org/opencb/cellbase/core/config/CellBaseConfigurationTest.java modified: cellbase-core/src/test/resources/configuration.yml modified: cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/ConservationBuilderTest.java modified: cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RepeatsBuilderTest.java modified: cellbase-lib/src/test/resources/configuration.test.yaml new file: cellbase-lib/src/test/resources/conservation/gerpVersion.json new file: cellbase-lib/src/test/resources/conservation/phastConsVersion.json new file: 
cellbase-lib/src/test/resources/conservation/phyloPVersion.json new file: cellbase-lib/src/test/resources/regulation/motifFeaturesVersion.json new file: cellbase-lib/src/test/resources/regulation/regulatoryBuildVersion.json modified: cellbase-lib/src/test/resources/repeats/repeats.test.json.gz renamed: cellbase-lib/src/test/resources/repeats/windowMasker.txt.gz -> cellbase-lib/src/test/resources/repeats/windowmaskerSdust.txt.gz modified: pom.xml --- .../config/CellBaseConfigurationTest.java | 2 +- .../src/test/resources/configuration.yml | 408 ++++++++++++++---- .../lib/builders/ConservationBuilderTest.java | 5 +- .../lib/builders/RepeatsBuilderTest.java | 18 +- .../test/resources/configuration.test.yaml | 382 ++++++++++++---- .../resources/conservation/gerpVersion.json | 1 + .../conservation/phastConsVersion.json | 1 + .../resources/conservation/phyloPVersion.json | 1 + .../regulation/motifFeaturesVersion.json | 1 + .../regulation/regulatoryBuildVersion.json | 1 + .../resources/repeats/repeats.test.json.gz | Bin 502 -> 484 bytes ...Masker.txt.gz => windowmaskerSdust.txt.gz} | Bin pom.xml | 2 + 13 files changed, 637 insertions(+), 185 deletions(-) create mode 100644 cellbase-lib/src/test/resources/conservation/gerpVersion.json create mode 100644 cellbase-lib/src/test/resources/conservation/phastConsVersion.json create mode 100644 cellbase-lib/src/test/resources/conservation/phyloPVersion.json create mode 100644 cellbase-lib/src/test/resources/regulation/motifFeaturesVersion.json create mode 100644 cellbase-lib/src/test/resources/regulation/regulatoryBuildVersion.json rename cellbase-lib/src/test/resources/repeats/{windowMasker.txt.gz => windowmaskerSdust.txt.gz} (100%) diff --git a/cellbase-core/src/test/java/org/opencb/cellbase/core/config/CellBaseConfigurationTest.java b/cellbase-core/src/test/java/org/opencb/cellbase/core/config/CellBaseConfigurationTest.java index 75bc8c2104..29546c02ad 100644 --- 
a/cellbase-core/src/test/java/org/opencb/cellbase/core/config/CellBaseConfigurationTest.java +++ b/cellbase-core/src/test/java/org/opencb/cellbase/core/config/CellBaseConfigurationTest.java @@ -41,7 +41,7 @@ public void defaultOutdir() { @Test public void vertebrates() { - Assertions.assertEquals(9, cellBaseConfiguration.getSpecies().getVertebrates().size()); + Assertions.assertEquals(11, cellBaseConfiguration.getSpecies().getVertebrates().size()); } @Test diff --git a/cellbase-core/src/test/resources/configuration.yml b/cellbase-core/src/test/resources/configuration.yml index 9031275f2b..0937ea7081 100644 --- a/cellbase-core/src/test/resources/configuration.yml +++ b/cellbase-core/src/test/resources/configuration.yml @@ -1,4 +1,5 @@ -version: ${CELLBASE.VERSION} + +version: "${CELLBASE.VERSION}" apiVersion: "${project.version}" wiki: https://github.com/opencb/cellbase/wiki maintenanceFlagFile: "/tmp/maintenance" @@ -8,24 +9,29 @@ logDir: "./logs" # where to output the logs # can be "console" or "file", defaults to console logOutput: "file" +# For testing secretKey: "xPacig89igHSieEnveJEi4KCfdEslhmssC3vui1JJQGgDQ0y8v" databases: mongodb: - host: "${CELLBASE.DB.MONGODB.HOST}" - user: "${CELLBASE.DB.USER}" - password: "${CELLBASE.DB.PASSWORD}" + host: "${JUNIT.CELLBASE.DB.MONGODB.HOST}" + user: "${JUNIT.CELLBASE.DB.USER}" + password: "${JUNIT.CELLBASE.DB.PASSWORD}" options: - authenticationDatabase: "${CELLBASE.DB.MONGODB.AUTHENTICATIONDATABASE}" - readPreference: "${CELLBASE.DB.MONGODB.READPREFERENCE}" - replicaSet: "${CELLBASE.DB.MONGODB.REPLICASET}" + authenticationDatabase: "${JUNIT.CELLBASE.DB.MONGODB.AUTHENTICATIONDATABASE}" + authenticationMechanism: "${JUNIT.CELLBASE.DB.MONGODB.AUTHENTICATION_MECHANISM}" + readPreference: "${JUNIT.CELLBASE.DB.MONGODB.READPREFERENCE}" + replicaSet: "${JUNIT.CELLBASE.DB.MONGODB.REPLICASET}" connectionsPerHost: 20 sslEnabled: false - enableSharding: true + # sslInvalidCertificatesAllowed: true + # sslInvalidHostnameAllowed: true 
+ enableSharding: false server: rest: - port: 9090 + port: 9090 #"${JUNIT.CELLBASE.SERVER.REST.PORT}" defaultOutdir: "/tmp" download: + ## Genomic and Gene information ensembl: database: host: ensembldb.ensembl.org:3306 @@ -33,7 +39,27 @@ download: password: '' libs: "${CELLBASE.ENSEMBL.LIBS}" url: - host: ftp://ftp.ensembl.org/pub + host: https://ftp.ensembl.org/pub/ + files: + # New Homo sapiens assemblies contain too many ALT regions, so we download 'primary_assembly' file instead + PRIMARY_FA: "release-put_release_here/fasta/put_species_here/dna/put_capital_species_here.put_assembly_here.dna.primary_assembly.fa.gz" + GTF: "release-put_release_here/gtf/put_species_here/put_capital_species_here.put_assembly_here.put_release_here.gtf.gz" + PEP_FA: "release-put_release_here/fasta/put_species_here/pep/put_capital_species_here.put_assembly_here.pep.all.fa.gz" + CDNA_FA: "release-put_release_here/fasta/put_species_here/cdna/put_capital_species_here.put_assembly_here.cdna.all.fa.gz" + REGULATORY_BUILD: "release-put_release_here/regulation/put_species_here/put_species_here.put_assembly_here.Regulatory_Build.regulatory_features.20221007.gff.gz" + MOTIF_FEATURES: "release-put_release_here/regulation/put_species_here/MotifFeatures/put_species_here.put_assembly_here.motif_features.gff.gz" + MOTIF_FEATURES_INDEX: "release-put_release_here/regulation/put_species_here/MotifFeatures/put_species_here.put_assembly_here.motif_features.gff.gz.tbi" + # To be generated manually + DESCRIPTION: "manual@description.txt" + # To be generated manually + XREFS: "manual@xrefs.txt" + # To be downloaded manually + HAEM_ONC_TRANSCRIPTS: "manual@EGLH_HaemOnc_transcripts.txt" + # To be downloaded manually + TSO500: "manual@TSO500_transcripts.txt" + # To be downloaded manually + CANONICAL: "manual@ensembl_canonical.txt" + ensemblGenomes: database: host: mysql-eg-publicsql.ebi.ac.uk:4157 @@ -42,141 +68,335 @@ download: libs: "${CELLBASE.ENSEMBL.LIBS}" url: host: ftp://ftp.ensemblgenomes.org/pub + 
refSeq: + host: https://ftp.ncbi.nih.gov/refseq/ + version: "2023-10-11" + files: + GENOMIC_GTF: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz + GENOMIC_FNA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz + PROTEIN_FAA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_protein.faa.gz + RNA_FNA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz + MMUSCULUS_GENOMIC_GTF: M_musculus/annotation_releases/GCF_000001635.27-RS_2024_02/GCF_000001635.27_GRCm39_genomic.gtf.gz + MMUSCULUS_GENOMIC_FNA: M_musculus/annotation_releases/GCF_000001635.27-RS_2024_02/GCF_000001635.27_GRCm39_genomic.fna.gz + MMUSCULUS_PROTEIN_FAA: M_musculus/annotation_releases/GCF_000001635.27-RS_2024_02/GCF_000001635.27_GRCm39_protein.faa.gz + MMUSCULUS_RNA_FNA: M_musculus/annotation_releases/GCF_000001635.27-RS_2024_02/GCF_000001635.27_GRCm39_rna.fna.gz + RNORVEGICUS_GENOMIC_GTF: R_norvegicus/annotation_releases/GCF_036323735.1-RS_2024_02/GCF_036323735.1_GRCr8_genomic.gtf.gz + RNORVEGICUS_GENOMIC_FNA: R_norvegicus/annotation_releases/GCF_036323735.1-RS_2024_02/GCF_036323735.1_GRCr8_genomic.fna.gz + RNORVEGICUS_PROTEIN_FAA: R_norvegicus/annotation_releases/GCF_036323735.1-RS_2024_02/GCF_036323735.1_GRCr8_protein.faa.gz + RNORVEGICUS_RNA_FNA: R_norvegicus/annotation_releases/GCF_036323735.1-RS_2024_02/GCF_036323735.1_GRCr8_rna.fna.gz + BTAURUS_GENOMIC_GTF: B_taurus/annotation_releases/GCF_002263795.3-RS_2023_09/GCF_002263795.3_ARS-UCD2.0_genomic.gtf.gz + BTAURUS_GENOMIC_FNA: B_taurus/annotation_releases/GCF_002263795.3-RS_2023_09/GCF_002263795.3_ARS-UCD2.0_genomic.fna.gz + BTAURUS_PROTEIN_FAA: B_taurus/annotation_releases/GCF_002263795.3-RS_2023_09/GCF_002263795.3_ARS-UCD2.0_protein.faa.gz + BTAURUS_RNA_FNA: B_taurus/annotation_releases/GCF_002263795.3-RS_2023_09/GCF_002263795.3_ARS-UCD2.0_rna.fna.gz + maneSelect: + host: https://ftp.ncbi.nlm.nih.gov/refseq/ + version: "1.2" + files: 
+ MANE_SELECT: MANE/MANE_human/release_1.2/MANE.GRCh38.v1.2.summary.txt.gz + lrg: + host: http://ftp.ebi.ac.uk/ + version: "2021-03-30" + files: + LRG: pub/databases/lrgex/list_LRGs_transcripts_xrefs.txt + hgnc: + host: https://ftp.ebi.ac.uk/ + version: "2024-04-01" + files: + HGNC: pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2024-04-01.txt + cancerHotspot: + host: https://www.cancerhotspots.org/ + version: "v2" + files: + CANCER_HOTSPOT: files/hotspots_v2.xls + dgidb: + host: https://old.dgidb.org/ + version: "2022-02-01" + files: + DGIDB: data/monthly_tsvs/2022-Feb/interactions.tsv geneUniprotXref: - host: ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/ + host: http://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/ + version: "2024-03-27" + files: + UNIPROT_XREF: HUMAN_9606_idmapping_selected.tab.gz + MMUSCULUS_UNIPROT_XREF: MOUSE_10090_idmapping_selected.tab.gz + RNORVEGICUS_UNIPROT_XREF: RAT_10116_idmapping_selected.tab.gz + DRERIO_UNIPROT_XREF: DANRE_7955_idmapping_selected.tab.gz + DMELOANOGASTER_UNIPROT_XREF: DROME_7227_idmapping_selected.tab.gz + SCEREVISIAE_UNIPROT_XREF: YEAST_559292_idmapping_selected.tab.gz + CELEGANS_UNIPROT_XREF: CAEEL_6239_idmapping_selected.tab.gz geneExpressionAtlas: - host: ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz + host: https://ftp.ebi.ac.uk/ + version: "2.0.14" + files: + GENE_EXPRESSION_ATLAS: pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz + hpo: + ## NOTE: Download manually from here now + host: https://hpo.jax.org/app/data/annotations/ + version: "2024-04-26" + files: + HPO: "manual@phenotype_to_genes.txt" + disgenet: + host: https://www.disgenet.org/ + version: "7.0 (January 2020)" + files: + DISGENET: static/disgenet_ap1/files/downloads/all_gene_disease_associations.tsv.gz + gnomadConstraints: + host: 
https://storage.googleapis.com/ + version: "2.1.1" + files: + GNOMAD_CONSTRAINTS: gcp-public-data--gnomad/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz + goAnnotation: + host: http://geneontology.org/ + files: + GO_ANNOTATION: gene-associations/goa_human.gaf.gz + MMUSCULUS_GO_ANNOTATION: gene-associations/mgi.gaf.gz + cancerGeneCensus: + ## To be downloaded manually + host: https://cancer.sanger.ac.uk/census/ + version: "v99" + files: + CANCER_GENE_CENSUS: "manual@cancer-gene-census.tsv" + + ## Regulation mirbase: - host: ftp://mirbase.org/pub/mirbase/CURRENT/miRNA.xls.gz - mirbaseReadme: - host: ftp://mirbase.org/pub/mirbase/CURRENT/README + host: https://www.mirbase.org/ + version: "22.1" + files: + MIRBASE: download/miRNA.dat targetScan: host: http://hgdownload.cse.ucsc.edu/goldenPath/ miRTarBase: - host: http://mirtarbase.cuhk.edu.cn/cache/download/8.0/hsa_MTI.xlsx + host: https://mirtarbase.cuhk.edu.cn/ + version: "9.0" + files: + # This file contains errors and has to be fixed before building + # check the script cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh + MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/hsa_MTI.xlsx + MMUSCULUS_MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/mmu_MTI.xlsx + RNORVEGICUS_MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/rno_MTI.xlsx + + ## Protein Data uniprot: - host: ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz - uniprotRelNotes: - host: ftp://ftp.uniprot.org/pub/databases/uniprot/relnotes.txt - intact: - host: ftp://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.txt + host: https://ftp.uniprot.org/ + version: "2024-03-27" + files: + UNIPROT: pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz interpro: - host: ftp://ftp.ebi.ac.uk/pub/databases/interpro/current/protein2ipr.dat.gz - interproRelNotes: - host: 
ftp://ftp.ebi.ac.uk/pub/databases/interpro/current/release_notes.txt - conservation: - host: ftp://hgdownload.cse.ucsc.edu/goldenPath/ + host: https://ftp.ebi.ac.uk/ + version: "2024-03-27" + files: + INTERPRO: pub/databases/interpro/current_release/protein2ipr.dat.gz + intact: + host: https://ftp.ebi.ac.uk/ + version: "2024-02-16" + files: + INTACT: pub/databases/intact/current/psimitab/intact.txt + + ## Conservation Scores + phastCons: + ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38; and put_chromosome_here by the chromosomes: 1,2,..X,Y,M + host: https://hgdownload.cse.ucsc.edu/ + version: "2022-08-30" + files: + PHASTCONS: goldenPath/hg38/phastCons470way/hg38.470way.phastCons/ + MMUSCULUS_PHASTCONS: goldenPath/mm39/phastCons35way/mm39.35way.phastCons/ + phylop: + ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38; and put_chromosome_here by the chromosomes: 1,2,..X,Y,M + host: https://hgdownload.cse.ucsc.edu/ + version: "2022-08-30" + files: + PHYLOP: goldenPath/hg38/phyloP470way/hg38.470way.phyloP/ + MMUSCULUS_PHYLOP: goldenPath/mm39/phyloP35way/mm39.35way.phyloP/ gerp: - host: ftp://ftp.ensembl.org/pub/current_compara/conservation_scores/103_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw + host: http://ftp.ensembl.org/ + version: "2023-05-17" + files: + GERP: pub/release-111/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw + MMUSCULUS_GERP: pub/release-111/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.mus_musculus.GRCm39.bw + + ## Clinical Variant clinvar: - host: ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2020-02.xml.gz - clinvarSummary: - host: ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz - clinvarVariationAllele: - host: ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variation_allele.txt.gz - 
clinvarEfoTerms: - host: ftp://ftp.ebi.ac.uk/pub/databases/eva/ClinVar/2015/ClinVar_Traits_EFO_Names_260615.csv - iarctp53: - host: http://p53.iarc.fr/ajax/Zipper.ashx - docm: - host: http://docm.info/api/ - docmVersion: - host: http://docm.info + host: https://ftp.ncbi.nlm.nih.gov/ + version: "2024-02" + files: + FULL_RELEASE: pub/clinvar/xml/RCV_xml_old_format/ClinVarFullRelease_2024-02.xml.gz + SUMMARY: pub/clinvar/tab_delimited/variant_summary.txt.gz + ALLELE: pub/clinvar/tab_delimited/variation_allele.txt.gz + EFO_TERMS: ftp://ftp.ebi.ac.uk/pub/databases/eva/ClinVar/2015/ClinVar_Traits_EFO_Names_260615.csv + cosmic: + ## To be downloaded manually + host: https://cancer.sanger.ac.uk/cosmic/ + version: "v99" + files: + COSMIC: CosmicMutantExport.tsv.gz + hgmd: + ## To be downloaded manually + host: https://www.hgmd.cf.ac.uk/ + version: "2020-03" + files: + HGMD: hgmd.vcf + gwasCatalog: + ## Download file from https://www.ebi.ac.uk/gwas/docs/file-downloads to find the real version, which is 'e111_r2024-04-22' + host: https://ftp.ebi.ac.uk/ + version: "2024-04-22" + files: + GWAS: pub/databases/gwas/releases/2024/04/22/gwas-catalog-associations_ontology-annotated.tsv + DBSNP: All.vcf.gz + pharmGKB: + host: https://api.pharmgkb.org/v1/download/file/data/ + version: v1 + files: + GENES: genes.zip + CHEMICALS: chemicals.zip + VARIANTS: variants.zip + GUIDELINE_ANNOTATIONS: guidelineAnnotations.json.zip + VARIANT_ANNOTATIONS: variantAnnotations.zip + CLINICAL_ANNOTATIONS: clinicalAnnotations.zip + CLINICAL_VARIANTS: clinicalVariants.zip + DRUG_LABELS: drugLabels.zip + RELATIONSHIPS: relationships.zip + dgv: host: http://dgv.tcag.ca/v106/docs simpleRepeats: - host: http://hgdownload.cse.ucsc.edu/goldenPath + host: http://hgdownload.cse.ucsc.edu/ + files: + SIMPLE_REPEATS: goldenPath/hg38/database/simpleRepeat.txt.gz + MMUSCULUS_SIMPLE_REPEATS: goldenPath/mm39/database/simpleRepeat.txt.gz windowMasker: - host: http://hgdownload.cse.ucsc.edu/goldenPath + host: 
http://hgdownload.cse.ucsc.edu/ + files: + WINDOW_MASKER: goldenPath/hg38/database/windowmaskerSdust.txt.gz + MMUSCULUS_WINDOW_MASKER: goldenPath/mm39/database/windowmaskerSdust.txt.gz genomicSuperDups: - host: http://hgdownload.cse.ucsc.edu/goldenPath - gwasCatalog: - host: ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/2016/09/28/gwas-catalog-associations.tsv - hpo: - host: http://compbio.charite.de/hudson/job/hpo.annotations/lastStableBuild/artifact/util/annotation/phenotype_to_genes.txt - disgenet: - host: https://www.disgenet.org/static/disgenet_ap1/files/downloads + host: http://hgdownload.cse.ucsc.edu/ files: - - all_gene_disease_associations.tsv.gz - - readme.txt - dgidb: - host: http://dgidb.org/data/interactions.tsv + GENOMIC_SUPER_DUPS: goldenPath/hg38/database/genomicSuperDups.txt.gz + + ## Variant Pathogenic Prediction + revel: + host: https://zenodo.org/ + version: "1.3" + files: + REVEL: record/7072866/files/revel-v1.3_all_chromosomes.zip cadd: - host: http://krishna.gs.washington.edu/download/CADD/v1.3/whole_genome_SNVs.tsv.gz - reactome: - host: http://www.reactome.org/download/current/biopax.zip - gnomadConstraints: - host: https://storage.googleapis.com/gnomad-public/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz - version: 2.1.1 + host: https://krishna.gs.washington.edu/ + version: "1.7" + files: + CADD: download/CADD/v1.7/GRCh38/whole_genome_SNVs.tsv.gz + + ## OBO Ontologies + ## The version is retrieved from the OBO file hpoObo: - host: http://purl.obolibrary.org/obo/hp.obo + host: http://purl.obolibrary.org/obo/ + files: + HPO: hp.obo goObo: - host: http://purl.obolibrary.org/obo/go/go-basic.obo + host: http://purl.obolibrary.org/obo/ + files: + GO: go/go-basic.obo doidObo: - host: http://purl.obolibrary.org/obo/doid.obo - goAnnotation: - host: http://geneontology.org/gene-associations/goa_human.gaf.gz - refSeq: - host: 
ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz - refSeqFasta: - host: ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz - refSeqProteinFasta: - host: ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_protein.faa.gz - refSeqCdna: - host: ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz - revel: - host: https://rothsj06.u.hpc.mssm.edu/revel_grch38_all_chromosomes.csv.zip + host: http://purl.obolibrary.org/obo/ + files: + DOID: doid.obo + mondoObo: + host: http://purl.obolibrary.org/obo/ + files: + MONDO: mondo.obo + + ## Splice score + mmSplice: + host: http://kipoi.org/models/MMSplice/mtsplice/ + version: 2.0 + spliceAi: + host: https://basespace.illumina.com/s/otSPW8hnhaZR + version: 1.3.1 + + ## Others + pubmed: + host: https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/ + version: 2024 + files: + PUBMED_REGEX: pubmed24n[1..1219..4].xml.gz + reactome: + host: http://www.reactome.org/download/current/biopax.zip + + species: vertebrates: - id: hsapiens scientificName: Homo sapiens assemblies: - # - ensemblVersion: '82_37' - # name: GRCh37 - - ensemblVersion: '99_38' + - ensemblVersion: '111_38' name: GRCh38 + # - ensemblVersion: '82_37' + # name: GRCh37 data: - - clinical_variants + - genome - conservation + - repeats - gene - - genome + - regulation + - protein + - clinical_variant - missense_variation_functional_score - ontology - - protein - - refseq - - regulation - - repeats - variation_functional_score - splice_score + - pharmacogenomics - id: mmusculus scientificName: Mus musculus assemblies: - - ensemblVersion: '82_38' - name: GRCm38 + - ensemblVersion: '111_39' + name: GRCm39 data: - genome - - genome_info + - conservation + - repeats - gene - - variation - regulation - protein - - conservation + # - variation + - id: rnorvegicus 
+ scientificName: Rattus norvegicus + assemblies: + - ensemblVersion: '111_7.2' + name: mRatBN7.2 + data: + - genome + - gene + - regulation + - protein + # - variation - id: drerio scientificName: Danio rerio assemblies: - - ensemblVersion: '82_10' - name: GRCz10 + - ensemblVersion: '111_11' + name: GRCz11 + data: + - genome + - gene + - regulation + - protein + # - variation + - id: btaurus + scientificName: Bos taurus + assemblies: + - ensemblVersion: '111_1.3' + name: ARS-UCD1.3 data: - genome - genome_info - gene + # - refseq + - regulation - variation - protein - - id: rnorvegicus - scientificName: Rattus norvegicus + - id: sscrofa + scientificName: Sus scrofa assemblies: - - ensemblVersion: '82_6' - name: Rnor_6.0 + - ensemblVersion: '111_11.1' + name: Sscrofa11.1 data: - genome - genome_info diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/ConservationBuilderTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/ConservationBuilderTest.java index 6a21908c13..32386fdb0e 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/ConservationBuilderTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/ConservationBuilderTest.java @@ -19,6 +19,7 @@ import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.databind.MapperFeature; import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.eclipse.jetty.util.ajax.JSON; import org.opencb.biodata.models.core.GenomicScoreRegion; @@ -26,6 +27,7 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; +import org.opencb.cellbase.lib.EtlCommons; import org.opencb.commons.utils.FileUtils; import java.io.BufferedReader; @@ -40,10 +42,9 @@ public class ConservationBuilderTest { private final int 
BATCH_SIZE = 100; + @Disabled @Test public void testParse() throws Exception { - CellBaseConfiguration configuration = CellBaseConfiguration.load(getClass().getResourceAsStream("configuration.test.yaml")); - Path conservationDir = Paths.get(ConservationBuilderTest.class.getResource("/conservation").toURI()); CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), "gerp.test"); (new ConservationBuilder(conservationDir, BATCH_SIZE, serializer)).parse(); diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RepeatsBuilderTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RepeatsBuilderTest.java index acce1fa92b..8e27bf3f98 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RepeatsBuilderTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RepeatsBuilderTest.java @@ -16,8 +16,6 @@ package org.opencb.cellbase.lib.builders; -import static org.junit.jupiter.api.Assertions.assertEquals; - import org.junit.jupiter.api.Test; import org.eclipse.jetty.util.ajax.JSON; import org.opencb.biodata.models.variant.avro.Repeat; @@ -28,11 +26,14 @@ import java.io.BufferedReader; import java.io.IOException; +import java.net.URL; import java.nio.file.Path; import java.nio.file.Paths; import java.util.HashSet; import java.util.Set; +import static org.junit.jupiter.api.Assertions.*; + /** * Created by fjlopez on 10/05/17. 
@@ -47,13 +48,20 @@ public RepeatsBuilderTest() { @Test public void testParse() throws Exception { - CellBaseConfiguration configuration = CellBaseConfiguration.load(getClass().getResourceAsStream("configuration.test.yaml")); + CellBaseConfiguration configuration = CellBaseConfiguration.load(getClass().getClassLoader().getResourceAsStream("configuration.test.yaml")); Path repeatsFilesDir = Paths.get(getClass().getResource("/repeats").getPath()); CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), "repeats.test"); (new RepeatsBuilder(repeatsFilesDir, serializer, configuration)).parse(); serializer.close(); - assertEquals(loadRepeatSet(Paths.get(getClass().getResource("/repeats/repeats.test.json.gz").getFile())), - loadRepeatSet(Paths.get("/tmp/repeats.test.json.gz"))); + Set expected = loadRepeatSet(Paths.get(getClass().getClassLoader().getResource("repeats/repeats.test.json.gz").getPath())); + Set current = loadRepeatSet(Paths.get("/tmp/repeats.test.json.gz")); + assertEquals(expected.size(), current.size()); + for (Repeat repeat : expected) { + assertTrue(current.contains(repeat)); + } + for (Repeat repeat : current) { + assertTrue(expected.contains(repeat)); + } } private Set loadRepeatSet(Path path) throws IOException { diff --git a/cellbase-lib/src/test/resources/configuration.test.yaml b/cellbase-lib/src/test/resources/configuration.test.yaml index 237d5993b1..2f7da1b6b7 100644 --- a/cellbase-lib/src/test/resources/configuration.test.yaml +++ b/cellbase-lib/src/test/resources/configuration.test.yaml @@ -16,158 +16,374 @@ databases: readPreference: '' replicaSet: '' connectionsPerHost: 20 - neo4j: - hsapiens: - host: "${JUNIT.CELLBASE.DB.NEO4J.HOST}" - user: "${JUNIT.CELLBASE.DB.USER}" - password: "${JUNIT.CELLBASE.DB.PASSWORD}" - mmusculus: - host: "${JUNIT.CELLBASE.DB.NEO4J.HOST}" - user: "${JUNIT.CELLBASE.DB.USER}" - password: "${JUNIT.CELLBASE.DB.PASSWORD}" defaultOutdir: "/tmp" download: + ## Genomic and Gene information 
ensembl: database: host: ensembldb.ensembl.org:3306 user: anonymous password: '' - libs: "${JUNIT.CELLBASE.ENSEMBL.LIBS}" + libs: "${CELLBASE.ENSEMBL.LIBS}" url: - host: ftp://ftp.ensembl.org/pub + host: https://ftp.ensembl.org/pub/ + files: + # New Homo sapiens assemblies contain too many ALT regions, so we download 'primary_assembly' file instead + PRIMARY_FA: "release-put_release_here/fasta/put_species_here/dna/put_capital_species_here.put_assembly_here.dna.primary_assembly.fa.gz" + GTF: "release-put_release_here/gtf/put_species_here/put_capital_species_here.put_assembly_here.put_release_here.gtf.gz" + PEP_FA: "release-put_release_here/fasta/put_species_here/pep/put_capital_species_here.put_assembly_here.pep.all.fa.gz" + CDNA_FA: "release-put_release_here/fasta/put_species_here/cdna/put_capital_species_here.put_assembly_here.cdna.all.fa.gz" + REGULATORY_BUILD: "release-put_release_here/regulation/put_species_here/put_species_here.put_assembly_here.Regulatory_Build.regulatory_features.20221007.gff.gz" + MOTIF_FEATURES: "release-put_release_here/regulation/put_species_here/MotifFeatures/put_species_here.put_assembly_here.motif_features.gff.gz" + MOTIF_FEATURES_INDEX: "release-put_release_here/regulation/put_species_here/MotifFeatures/put_species_here.put_assembly_here.motif_features.gff.gz.tbi" + # To be generated manually + DESCRIPTION: "manual@description.txt" + # To be generated manually + XREFS: "manual@xrefs.txt" + # To be downloaded manually + HAEM_ONC_TRANSCRIPTS: "manual@EGLH_HaemOnc_transcripts.txt" + # To be downloaded manually + TSO500: "manual@TSO500_transcripts.txt" + # To be downloaded manually + CANONICAL: "manual@ensembl_canonical.txt" + ensemblGenomes: database: host: mysql-eg-publicsql.ebi.ac.uk:4157 user: anonymous password: '' - libs: "${JUNIT.CELLBASE.ENSEMBL.LIBS}" + libs: "${CELLBASE.ENSEMBL.LIBS}" url: host: ftp://ftp.ensemblgenomes.org/pub + refSeq: + host: https://ftp.ncbi.nih.gov/refseq/ + version: "2023-10-11" + files: + GENOMIC_GTF: 
H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz + GENOMIC_FNA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz + PROTEIN_FAA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_protein.faa.gz + RNA_FNA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz + MMUSCULUS_GENOMIC_GTF: M_musculus/annotation_releases/GCF_000001635.27-RS_2024_02/GCF_000001635.27_GRCm39_genomic.gtf.gz + MMUSCULUS_GENOMIC_FNA: M_musculus/annotation_releases/GCF_000001635.27-RS_2024_02/GCF_000001635.27_GRCm39_genomic.fna.gz + MMUSCULUS_PROTEIN_FAA: M_musculus/annotation_releases/GCF_000001635.27-RS_2024_02/GCF_000001635.27_GRCm39_protein.faa.gz + MMUSCULUS_RNA_FNA: M_musculus/annotation_releases/GCF_000001635.27-RS_2024_02/GCF_000001635.27_GRCm39_rna.fna.gz + RNORVEGICUS_GENOMIC_GTF: R_norvegicus/annotation_releases/GCF_036323735.1-RS_2024_02/GCF_036323735.1_GRCr8_genomic.gtf.gz + RNORVEGICUS_GENOMIC_FNA: R_norvegicus/annotation_releases/GCF_036323735.1-RS_2024_02/GCF_036323735.1_GRCr8_genomic.fna.gz + RNORVEGICUS_PROTEIN_FAA: R_norvegicus/annotation_releases/GCF_036323735.1-RS_2024_02/GCF_036323735.1_GRCr8_protein.faa.gz + RNORVEGICUS_RNA_FNA: R_norvegicus/annotation_releases/GCF_036323735.1-RS_2024_02/GCF_036323735.1_GRCr8_rna.fna.gz + BTAURUS_GENOMIC_GTF: B_taurus/annotation_releases/GCF_002263795.3-RS_2023_09/GCF_002263795.3_ARS-UCD2.0_genomic.gtf.gz + BTAURUS_GENOMIC_FNA: B_taurus/annotation_releases/GCF_002263795.3-RS_2023_09/GCF_002263795.3_ARS-UCD2.0_genomic.fna.gz + BTAURUS_PROTEIN_FAA: B_taurus/annotation_releases/GCF_002263795.3-RS_2023_09/GCF_002263795.3_ARS-UCD2.0_protein.faa.gz + BTAURUS_RNA_FNA: B_taurus/annotation_releases/GCF_002263795.3-RS_2023_09/GCF_002263795.3_ARS-UCD2.0_rna.fna.gz + maneSelect: + host: https://ftp.ncbi.nlm.nih.gov/refseq/ + version: "1.2" + files: + MANE_SELECT: MANE/MANE_human/release_1.2/MANE.GRCh38.v1.2.summary.txt.gz + lrg: + host: 
http://ftp.ebi.ac.uk/ + version: "2021-03-30" + files: + LRG: pub/databases/lrgex/list_LRGs_transcripts_xrefs.txt + hgnc: + host: https://ftp.ebi.ac.uk/ + version: "2024-04-01" + files: + HGNC: pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2024-04-01.txt + cancerHotspot: + host: https://www.cancerhotspots.org/ + version: "v2" + files: + CANCER_HOTSPOT: files/hotspots_v2.xls + dgidb: + host: https://old.dgidb.org/ + version: "2022-02-01" + files: + DGIDB: data/monthly_tsvs/2022-Feb/interactions.tsv geneUniprotXref: - host: ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/ + host: http://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/ + version: "2024-03-27" + files: + UNIPROT_XREF: HUMAN_9606_idmapping_selected.tab.gz + MMUSCULUS_UNIPROT_XREF: MOUSE_10090_idmapping_selected.tab.gz + RNORVEGICUS_UNIPROT_XREF: RAT_10116_idmapping_selected.tab.gz + DRERIO_UNIPROT_XREF: DANRE_7955_idmapping_selected.tab.gz + DMELOANOGASTER_UNIPROT_XREF: DROME_7227_idmapping_selected.tab.gz + SCEREVISIAE_UNIPROT_XREF: YEAST_559292_idmapping_selected.tab.gz + CELEGANS_UNIPROT_XREF: CAEEL_6239_idmapping_selected.tab.gz geneExpressionAtlas: - host: ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz + host: https://ftp.ebi.ac.uk/ + version: "2.0.14" + files: + GENE_EXPRESSION_ATLAS: pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz + hpo: + ## NOTE: Download manually from here now + host: https://hpo.jax.org/app/data/annotations/ + version: "2024-04-26" + files: + HPO: "manual@phenotype_to_genes.txt" + disgenet: + host: https://www.disgenet.org/ + version: "7.0 (January 2020)" + files: + DISGENET: static/disgenet_ap1/files/downloads/all_gene_disease_associations.tsv.gz + gnomadConstraints: + host: https://storage.googleapis.com/ + version: "2.1.1" + files: + GNOMAD_CONSTRAINTS: 
gcp-public-data--gnomad/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz + goAnnotation: + host: http://geneontology.org/ + files: + GO_ANNOTATION: gene-associations/goa_human.gaf.gz + MMUSCULUS_GO_ANNOTATION: gene-associations/mgi.gaf.gz + cancerGeneCensus: + ## To be downloaded manually + host: https://cancer.sanger.ac.uk/census/ + version: "v99" + files: + CANCER_GENE_CENSUS: "manual@cancer-gene-census.tsv" + + ## Regulation mirbase: - host: ftp://mirbase.org/pub/mirbase/CURRENT/ - mirbaseReadme: - host: ftp://mirbase.org/pub/mirbase/CURRENT/README + host: https://www.mirbase.org/ + version: "22.1" + files: + MIRBASE: download/miRNA.dat targetScan: host: http://hgdownload.cse.ucsc.edu/goldenPath/ miRTarBase: - host: http://mirtarbase.mbc.nctu.edu.tw/cache/download/4.5/ + host: https://mirtarbase.cuhk.edu.cn/ + version: "9.0" + files: + # This file contains errors and has to be fixed before building + # check the script cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh + MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/hsa_MTI.xlsx + MMUSCULUS_MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/mmu_MTI.xlsx + RNORVEGICUS_MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/rno_MTI.xlsx + + ## Protein Data uniprot: - host: ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz - uniprotRelNotes: - host: ftp://ftp.uniprot.org/pub/databases/uniprot/relnotes.txt - intact: - host: ftp://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.txt + host: https://ftp.uniprot.org/ + version: "2024-03-27" + files: + UNIPROT: pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz interpro: - host: ftp://ftp.ebi.ac.uk/pub/databases/interpro/current/protein2ipr.dat.gz - interproRelNotes: - host: ftp://ftp.ebi.ac.uk/pub/databases/interpro/current/release_notes.txt - conservation: - host: ftp://hgdownload.cse.ucsc.edu/goldenPath/ + host: 
https://ftp.ebi.ac.uk/ + version: "2024-03-27" + files: + INTERPRO: pub/databases/interpro/current_release/protein2ipr.dat.gz + intact: + host: https://ftp.ebi.ac.uk/ + version: "2024-02-16" + files: + INTACT: pub/databases/intact/current/psimitab/intact.txt + + ## Conservation Scores + phastCons: + ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38; and put_chromosome_here by the chromosomes: 1,2,..X,Y,M + host: https://hgdownload.cse.ucsc.edu/ + version: "2022-08-30" + files: + PHASTCONS: goldenPath/hg38/phastCons470way/hg38.470way.phastCons/ + MMUSCULUS_PHASTCONS: goldenPath/mm39/phastCons35way/mm39.35way.phastCons/ + phylop: + ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38; and put_chromosome_here by the chromosomes: 1,2,..X,Y,M + host: https://hgdownload.cse.ucsc.edu/ + version: "2022-08-30" + files: + PHYLOP: goldenPath/hg38/phyloP470way/hg38.470way.phyloP/ + MMUSCULUS_PHYLOP: goldenPath/mm39/phyloP35way/mm39.35way.phyloP/ gerp: - host: http://mendel.stanford.edu/SidowLab/downloads/gerp/hg19.GERP_scores.tar.gz + host: http://ftp.ensembl.org/ + version: "2023-05-17" + files: + GERP: pub/release-111/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw + MMUSCULUS_GERP: pub/release-111/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.mus_musculus.GRCm39.bw + + ## Clinical Variant clinvar: - host: ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2017-07.xml.gz - clinvarSummary: - host: ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz - clinvarVariationAllele: - host: ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variation_allele.txt.gz - clinvarEfoTerms: - host: ftp://ftp.ebi.ac.uk/pub/databases/eva/ClinVar/2015/ClinVar_Traits_EFO_Names_260615.csv - iarctp53: - host: http://p53.iarc.fr/ajax/Zipper.ashx - docm: - host: http://docm.genome.wustl.edu/api/ - 
docmVersion: - host: http://docm.genome.wustl.edu + host: https://ftp.ncbi.nlm.nih.gov/ + version: "2024-02" + files: + FULL_RELEASE: pub/clinvar/xml/RCV_xml_old_format/ClinVarFullRelease_2024-02.xml.gz + SUMMARY: pub/clinvar/tab_delimited/variant_summary.txt.gz + ALLELE: pub/clinvar/tab_delimited/variation_allele.txt.gz + EFO_TERMS: ftp://ftp.ebi.ac.uk/pub/databases/eva/ClinVar/2015/ClinVar_Traits_EFO_Names_260615.csv + cosmic: + ## To be downloaded manually + host: https://cancer.sanger.ac.uk/cosmic/ + version: "v99" + files: + COSMIC: CosmicMutantExport.tsv.gz + hgmd: + ## To be downloaded manually + host: https://www.hgmd.cf.ac.uk/ + version: "2020-03" + files: + HGMD: hgmd.vcf + gwasCatalog: + ## Download file from https://www.ebi.ac.uk/gwas/docs/file-downloads to find the real version, which is 'e111_r2024-04-22' + host: https://ftp.ebi.ac.uk/ + version: "2024-04-22" + files: + GWAS: pub/databases/gwas/releases/2024/04/22/gwas-catalog-associations_ontology-annotated.tsv + DBSNP: All.vcf.gz + pharmGKB: + host: https://api.pharmgkb.org/v1/download/file/data/ + version: v1 + files: + GENES: genes.zip + CHEMICALS: chemicals.zip + VARIANTS: variants.zip + GUIDELINE_ANNOTATIONS: guidelineAnnotations.json.zip + VARIANT_ANNOTATIONS: variantAnnotations.zip + CLINICAL_ANNOTATIONS: clinicalAnnotations.zip + CLINICAL_VARIANTS: clinicalVariants.zip + DRUG_LABELS: drugLabels.zip + RELATIONSHIPS: relationships.zip + dgv: host: http://dgv.tcag.ca/v106/docs - simpleRepeats: host: http://hgdownload.cse.ucsc.edu/ files: - ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38 - SIMPLE_REPEATS: goldenPath/put_assembly_here/database/simpleRepeat.txt.gz + SIMPLE_REPEATS: goldenPath/hg38/database/simpleRepeat.txt.gz + MMUSCULUS_SIMPLE_REPEATS: goldenPath/mm39/database/simpleRepeat.txt.gz windowMasker: host: http://hgdownload.cse.ucsc.edu/ files: - ## The CellBase downloader will change put_assembly_here by the assembly, e.g. 
hg38 - WINDOW_MASKER: goldenPath/put_assembly_here/database/windowmaskerSdust.txt.gz + WINDOW_MASKER: goldenPath/hg38/database/windowmaskerSdust.txt.gz + MMUSCULUS_WINDOW_MASKER: goldenPath/mm39/database/windowmaskerSdust.txt.gz genomicSuperDups: host: http://hgdownload.cse.ucsc.edu/ files: - ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38 - GENOMIC_SUPER_DUPS: goldenPath/put_assembly_here/database/genomicSuperDups.txt.gz + GENOMIC_SUPER_DUPS: goldenPath/hg38/database/genomicSuperDups.txt.gz - gwasCatalog: - host: ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/2016/09/28/gwas-catalog-associations.tsv - hpo: - host: http://compbio.charite.de/hudson/job/hpo.annotations.monthly/lastStableBuild/artifact/annotation/ALL_SOURCES_ALL_FREQUENCIES_diseases_to_genes_to_phenotypes.txt - disgenet: - host: http://www.disgenet.org/ds/DisGeNET/results/all_gene_disease_associations.tsv.gz - disgenetReadme: - host: http://www.disgenet.org/ds/DisGeNET/results/readme.txt - dgidb: - host: http://dgidb.org/data/interactions.tsv + ## Variant Pathogenic Prediction + revel: + host: https://zenodo.org/ + version: "1.3" + files: + REVEL: record/7072866/files/revel-v1.3_all_chromosomes.zip cadd: - host: http://krishna.gs.washington.edu/download/CADD/v1.3/whole_genome_SNVs.tsv.gz + host: https://krishna.gs.washington.edu/ + version: "1.7" + files: + CADD: download/CADD/v1.7/GRCh38/whole_genome_SNVs.tsv.gz + + ## OBO Ontologies + ## The version is retrieved from the OBO file + hpoObo: + host: http://purl.obolibrary.org/obo/ + files: + HPO: hp.obo + goObo: + host: http://purl.obolibrary.org/obo/ + files: + GO: go/go-basic.obo + doidObo: + host: http://purl.obolibrary.org/obo/ + files: + DOID: doid.obo + mondoObo: + host: http://purl.obolibrary.org/obo/ + files: + MONDO: mondo.obo + + ## Splice score + mmSplice: + host: http://kipoi.org/models/MMSplice/mtsplice/ + version: 2.0 + spliceAi: + host: https://basespace.illumina.com/s/otSPW8hnhaZR + version: 1.3.1 + 
+ ## Others + pubmed: + host: https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/ + version: 2024 + files: + PUBMED_REGEX: pubmed24n[1..1219..4].xml.gz reactome: host: http://www.reactome.org/download/current/biopax.zip + + species: vertebrates: - id: hsapiens scientificName: Homo sapiens assemblies: - - ensemblVersion: '89_38' + - ensemblVersion: '111_38' name: GRCh38 - - ensemblVersion: '82_37' - name: GRCh37 + # - ensemblVersion: '82_37' + # name: GRCh37 data: - genome - - genome_info + - conservation + - repeats - gene - - gene_disease_association - - variation - - variation_functional_score - regulation - protein - - conservation - - clinical_variants - - clinical - - svs - - repeats + - clinical_variant + - missense_variation_functional_score + - ontology + - variation_functional_score + - splice_score + - pharmacogenomics - id: mmusculus scientificName: Mus musculus assemblies: - - ensemblVersion: '82_38' - name: GRCm38 + - ensemblVersion: '111_39' + name: GRCm39 data: - genome - - genome_info + - conservation + - repeats - gene - - variation - regulation - protein - - conservation + # - variation + - id: rnorvegicus + scientificName: Rattus norvegicus + assemblies: + - ensemblVersion: '111_7.2' + name: mRatBN7.2 + data: + - genome + - gene + - regulation + - protein + # - variation - id: drerio scientificName: Danio rerio assemblies: - - ensemblVersion: '82_10' - name: GRCz10 + - ensemblVersion: '111_11' + name: GRCz11 + data: + - genome + - gene + - regulation + - protein + # - variation + - id: btaurus + scientificName: Bos taurus + assemblies: + - ensemblVersion: '111_1.3' + name: ARS-UCD1.3 data: - genome - genome_info - gene + # - refseq + - regulation - variation - protein - - id: rnorvegicus - scientificName: Rattus norvegicus + - id: sscrofa + scientificName: Sus scrofa assemblies: - - ensemblVersion: '82_6' - name: Rnor_6.0 + - ensemblVersion: '111_11.1' + name: Sscrofa11.1 data: - genome - genome_info diff --git 
a/cellbase-lib/src/test/resources/conservation/gerpVersion.json b/cellbase-lib/src/test/resources/conservation/gerpVersion.json new file mode 100644 index 0000000000..f74cabaa9f --- /dev/null +++ b/cellbase-lib/src/test/resources/conservation/gerpVersion.json @@ -0,0 +1 @@ +{"downloadDate":"20240612_094720","name":"gerp","version":"version-11","urls":["https://toto.com/conservation_19.json.gz"]} \ No newline at end of file diff --git a/cellbase-lib/src/test/resources/conservation/phastConsVersion.json b/cellbase-lib/src/test/resources/conservation/phastConsVersion.json new file mode 100644 index 0000000000..6b5201a8ce --- /dev/null +++ b/cellbase-lib/src/test/resources/conservation/phastConsVersion.json @@ -0,0 +1 @@ +{"downloadDate":"20240612_094720","name":"phastcons","version":"version-12","urls":["https://toto.com/empty.wigFix.gz"]} \ No newline at end of file diff --git a/cellbase-lib/src/test/resources/conservation/phyloPVersion.json b/cellbase-lib/src/test/resources/conservation/phyloPVersion.json new file mode 100644 index 0000000000..ab917129f7 --- /dev/null +++ b/cellbase-lib/src/test/resources/conservation/phyloPVersion.json @@ -0,0 +1 @@ +{"downloadDate":"20240612_094720","name":"phylop","version":"version-11","urls":["https://toto.com/empty.wigFix.gz"]} \ No newline at end of file diff --git a/cellbase-lib/src/test/resources/regulation/motifFeaturesVersion.json b/cellbase-lib/src/test/resources/regulation/motifFeaturesVersion.json new file mode 100644 index 0000000000..85161a2e1c --- /dev/null +++ b/cellbase-lib/src/test/resources/regulation/motifFeaturesVersion.json @@ -0,0 +1 @@ +{"downloadDate":"20240612_094720","name":"motif features","version":"version-11","urls":["https://toto.com/motif_features.gff.gz", "https://toto.com/motif_features.gff.gz.tbi"]} \ No newline at end of file diff --git a/cellbase-lib/src/test/resources/regulation/regulatoryBuildVersion.json b/cellbase-lib/src/test/resources/regulation/regulatoryBuildVersion.json new file mode 
100644 index 0000000000..dcafc16b10 --- /dev/null +++ b/cellbase-lib/src/test/resources/regulation/regulatoryBuildVersion.json @@ -0,0 +1 @@ +{"downloadDate":"20240612_094720","name":"regulatory build","version":"version-11","urls":["https://toto.com/Regulatory_Build.regulatory_features.gff.gz"]} \ No newline at end of file diff --git a/cellbase-lib/src/test/resources/repeats/repeats.test.json.gz b/cellbase-lib/src/test/resources/repeats/repeats.test.json.gz index 5aef8a765f95475f1a77c5bbf641038ddefae9f7..2f0f85084d4344bf2c14a0d4e67a7155be6dd6c9 100644 GIT binary patch literal 484 zcmVWnpx4E_7vcbS`RhZ*BmUl+SXTFc8M?eF~4x45Jl7 ze|*#Ez^T1NFW_P|@xhP)o=%hLyLVS{af~q`Fr$GL`+dJ}jd^4BQ`a8bzCA{^Wxz7l z54&z)Ti~2S#$vOlYpG1eo?~}t@m6K5Zks+f{i%OAykg@lYMqPJ zP{(H2J;v|5p?+dpp4$=~>bAosZEVi*F20;%QRKkpu zhVu)hu(z04yXI6mpTUI0^-ftxHKBA><>@>xw7T=^s;|N8@&a$IzQF658SCkgi_d2) z3pt^m@GgdN9?wwu;L=0E(_U4D>b)&|DJ%$~{)YuAKZFI)w`oDZBrfE3Hm(`d3SCUn zQYbtq0}(rlf*1Ing%p%B1s`Z1ixkaKJW8Kh-Pr5Wc&ap$bFUwJ1J!~HsX0A8H=4tq zMgm$%++Fj(Ga9FsT@69ZzOeY4TVcU`Fn#Rz7gKG7Sm`OaA)Qs8rr@Q#=UJE%R(gUp zQm*wB#@@}jYzUg44GRr&bx4cH9m5T|L5?n)wI5Pq7`1*Mr&3~Bs#JWGDvel_Dn5}i adX!SIOeL44uD`)IRQ~{c=X<(k1^@t{4C})H literal 502 zcmVWnpx4E_7vcbS`RhZ*BmUl*?}0Fc3xe{R)F?LEumn zAKnoRpjEmFf54U5L?|q|dLT%H{(Cu$E60kUFcd(DL(bfDM~_!hKlRaGt-Qt6zAmH$2YwnlkpjI|aSx~Y$mvHwZ_etX@Z6im&O ziqb13(;qQI?3PpHbOjS4*BfQQ#e&jQmBs5a;cPCmslEm;%PYLr@(M3kdMuYsEI)m5xn>&FJJCnqcBiI(n8M+^>3ju{im^m}Mitgf(pLp7 zd5V6aN=VcPtvFBBXG$me-xdc+FXDv+o@`!+ks!W9Wypi#F7?NYJagUWragAg_lMzE sz*!V!+eJ%3=c1)ziA9S~iYhr56_85Gw~OBX(Kkr{0hMsm6TJoi03pBgNB{r; diff --git a/cellbase-lib/src/test/resources/repeats/windowMasker.txt.gz b/cellbase-lib/src/test/resources/repeats/windowmaskerSdust.txt.gz similarity index 100% rename from cellbase-lib/src/test/resources/repeats/windowMasker.txt.gz rename to cellbase-lib/src/test/resources/repeats/windowmaskerSdust.txt.gz diff --git a/pom.xml b/pom.xml index 61e1f1d7fa..875f6a5965 100644 --- a/pom.xml +++ b/pom.xml @@ -624,6 +624,8 @@ cellbase admin 
SCRAM-SHA-256 + secondaryPreferred + 9090 From 19efdf4a9cd1ea881ca6729bd63785d248d3bec4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 3 Jul 2024 17:32:27 +0200 Subject: [PATCH 082/148] cicd: update task.yml to deploy cellbase-builder docker, #TASK-5564 On branch TASK-5564 Changes to be committed: modified: .github/workflows/task.yml --- .github/workflows/task.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/task.yml b/.github/workflows/task.yml index ac09089acb..f71e27a41d 100644 --- a/.github/workflows/task.yml +++ b/.github/workflows/task.yml @@ -21,5 +21,5 @@ jobs: uses: opencb/java-common-libs/.github/workflows/deploy-docker-hub-workflow.yml@develop needs: test with: - cli: python3 ./build/cloud/docker/docker-build.py push --images base --tag ${{ github.ref_name }} + cli: python3 ./build/cloud/docker/docker-build.py push --images base,builder --tag ${{ github.ref_name }} secrets: inherit From fcbb68017597e380aaf0aa06cc286c24e7c60546 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 4 Jul 2024 09:57:32 +0200 Subject: [PATCH 083/148] build: create the MiRTarBase parser for .xlsx files, #TASK-5576, #TASK-5564 - This parser skips possible errors in the .xlsx file - No need to use the script fix-gene-symbol.sh (so it is removed) - Update configuration files (by removing comments about the fix-gene-symbol.sh references) - Add JUnit test for the MiRTarBase parser On branch TASK-5564 Changes to be committed: deleted: cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh modified: cellbase-core/src/main/resources/configuration.yml modified: cellbase-core/src/test/resources/configuration.yml modified: cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java modified: cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java new file: 
cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/MiRTarBaseIndexer.java modified: cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java new file: cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexerTest.java modified: cellbase-lib/src/test/resources/configuration.test.yaml --- .../app/scripts/mirtarbase/fix-gene-symbol.sh | 60 ------- .../src/main/resources/configuration.yml | 2 - .../src/test/resources/configuration.yml | 2 - .../lib/builders/EnsemblGeneBuilder.java | 23 +-- .../lib/builders/GeneBuilderIndexer.java | 91 ++--------- .../lib/builders/MiRTarBaseIndexer.java | 148 ++++++++++++++++++ .../lib/builders/RefSeqGeneBuilder.java | 24 +-- .../lib/builders/GeneBuilderIndexerTest.java | 65 ++++++++ .../test/resources/configuration.test.yaml | 2 - 9 files changed, 227 insertions(+), 190 deletions(-) delete mode 100755 cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh create mode 100644 cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/MiRTarBaseIndexer.java create mode 100644 cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexerTest.java diff --git a/cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh b/cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh deleted file mode 100755 index 38c7d1efa2..0000000000 --- a/cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# The original MirTarBase hsa_MTI.xlsx contains invalid Gene Symbols in 793 lines. 
-# To fix it, that file has to be converted to a CSV file, i.e.: hsa_MTI.csv -# -# After converting to CSV file, we can see the errors from the original file for the Gene Symbols (column 4), -# e.g.: 06-mar: -# MIRT050267,hsa-miR-25-3p,Homo sapiens,06-mar,10299,Homo sapiens,CLASH,Functional MTI (Weak),23622248 -# MIRT051174,hsa-miR-16-5p,Homo sapiens,06-mar,10299,Homo sapiens,CLASH,Functional MTI (Weak),23622248 -# -# This script fix those lines and convert the column 4 for a vaild Gene Symbol: -# -# MIRT050267,hsa-miR-25-3p,Homo sapiens,MARCHF6,10299,Homo sapiens,CLASH,Functional MTI (Weak),23622248 -# MIRT051174,hsa-miR-16-5p,Homo sapiens,MARCHF6,10299,Homo sapiens,CLASH,Functional MTI (Weak),23622248 - -# Check the parameters number -if [ "$#" -ne 1 ]; then - echo "Usage: $0 " - exit 1 -fi - -# Check CSV file -csv_file="$1" -if [ ! -f "$csv_file" ]; then - echo "CSV file '$csv_file' does not exist." - exit 1 -fi - -# Fix gene-symbol -while IFS=$'\t' read -r c1 c2 c3 c4 c5 c6 c7 c8 c9 || [[ -n "$c1" ]]; do - # Aplica las condiciones - if [ "$c5" = "10299" ]; then - c4="MARCHF6" - elif [ "$c5" = "51257" ]; then - c4="MARCHF2" - elif [ "$c5" = "54708" ]; then - c4="MARCHF5" - elif [ "$c5" = "54996" ]; then - c4="MTARC2" - elif [ "$c5" = "55016" ]; then - c4="MARCHF1" - elif [ "$c5" = "57574" ]; then - c4="MARCHF4" - elif [ "$c5" = "64757" ]; then - c4="MTARC1" - elif [ "$c5" = "64844" ]; then - c4="MARCHF7" - elif [ "$c5" = "92979" ]; then - c4="MARCHF9" - elif [ "$c5" = "115123" ]; then - c4="MARCHF3" - elif [ "$c5" = "220972" ]; then - c4="MARCHF8" - elif [ "$c5" = "441061" ]; then - c4="MARCHF11" - fi - - # Print line - echo -e "$c1\t$c2\t$c3\t$c4\t$c5\t$c6\t$c7\t$c8\t$c9" -done < "$csv_file" diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 88d8d8a9fd..d50579ffcf 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ 
-169,8 +169,6 @@ download: host: https://mirtarbase.cuhk.edu.cn/ version: "9.0" files: - # This file contains errors and has to be fixed before building - # check the script cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/hsa_MTI.xlsx MMUSCULUS_MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/mmu_MTI.xlsx RNORVEGICUS_MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/rno_MTI.xlsx diff --git a/cellbase-core/src/test/resources/configuration.yml b/cellbase-core/src/test/resources/configuration.yml index 0937ea7081..dc7901d8d5 100644 --- a/cellbase-core/src/test/resources/configuration.yml +++ b/cellbase-core/src/test/resources/configuration.yml @@ -169,8 +169,6 @@ download: host: https://mirtarbase.cuhk.edu.cn/ version: "9.0" files: - # This file contains errors and has to be fixed before building - # check the script cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/hsa_MTI.xlsx MMUSCULUS_MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/mmu_MTI.xlsx RNORVEGICUS_MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/rno_MTI.xlsx diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java index e8ea728da3..03a3df949d 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java @@ -40,7 +40,6 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.util.*; -import java.util.stream.Collectors; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -167,27 +166,7 @@ public void check() throws Exception { miRBaseFile = checkFiles(MIRBASE_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA), 1).get(0).toPath(); // mirtarbase - // The 
downloaded .xlsx file contains errors and it has to be fixed manually - logger.info("Checking {} folder and files", getDataName(MIRTARBASE_DATA)); - Path downloadRegulationPath = downloadPath.getParent().getParent().resolve(REGULATION_DATA); - List mirTarBaseFiles = ((DataSource) dataSourceReader.readValue(downloadRegulationPath.resolve( - getDataVersionFilename(MIRTARBASE_DATA)).toFile())).getUrls().stream().map(u -> Paths.get(u).getFileName().toString()) - .collect(Collectors.toList()); - if (mirTarBaseFiles.size() != 1) { - throw new CellBaseException("One " + getDataName(MIRTARBASE_DATA) + " file is expected at " + downloadRegulationPath - + ", but currently there are " + mirTarBaseFiles.size() + " files"); - } - // The hsa_MIT.xlsx is fixed and converted to hsa_MIT.csv manually - if (!mirTarBaseFiles.get(0).endsWith(XLSX_EXTENSION)) { - throw new CellBaseException("A " + XLSX_EXTENSION + " " + getDataName(MIRTARBASE_DATA) + " file is expected at " - + downloadRegulationPath + ", but currently it is named " + mirTarBaseFiles.get(0)); - } - miRTarBaseFile = downloadRegulationPath.resolve(mirTarBaseFiles.get(0).replace(XLSX_EXTENSION, CSV_EXTENSION)); - if (!Files.exists(miRTarBaseFile)) { - throw new CellBaseException("The " + getDataName(MIRTARBASE_DATA) + " fixed file " + miRTarBaseFile + " does not exist. 
You" - + " have to export the file " + mirTarBaseFiles.get(0) + " to " + miRTarBaseFile.getFileName() + " format separated by" - + " tabs and then execute the script cellbase-app/app/scripts/mirtarbase/fix-gene-symbols.sh"); - } + miRTarBaseFile = checkFiles(MIRTARBASE_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA), 1).get(0).toPath(); // Check genome FASTA file Path genomeDownloadPath = downloadPath.getParent().getParent().resolve(GENOME_DATA); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java index b8941cc448..282ac9ee49 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java @@ -24,10 +24,12 @@ import org.opencb.biodata.formats.sequence.fasta.Fasta; import org.opencb.biodata.formats.sequence.fasta.io.FastaReader; import org.opencb.biodata.models.clinical.ClinicalProperty; -import org.opencb.biodata.models.core.*; +import org.opencb.biodata.models.core.CancerHotspot; +import org.opencb.biodata.models.core.CancerHotspotVariant; +import org.opencb.biodata.models.core.GeneCancerAssociation; +import org.opencb.biodata.models.core.MirnaTarget; import org.opencb.biodata.models.variant.avro.GeneDrugInteraction; import org.opencb.biodata.models.variant.avro.GeneTraitAssociation; -import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.commons.utils.FileUtils; import org.rocksdb.Options; import org.rocksdb.RocksDB; @@ -38,12 +40,12 @@ import java.io.BufferedReader; import java.io.FileInputStream; import java.io.IOException; -import java.nio.file.Files; import java.nio.file.Path; import java.util.*; import java.util.stream.Collectors; -import static org.opencb.cellbase.lib.EtlCommons.*; +import static org.opencb.cellbase.lib.EtlCommons.DISGENET_DATA; +import static 
org.opencb.cellbase.lib.EtlCommons.ENSEMBL_DATA; import static org.opencb.cellbase.lib.builders.CellBaseBuilder.PARSING_DONE_LOG_MESSAGE; import static org.opencb.cellbase.lib.builders.CellBaseBuilder.PARSING_LOG_MESSAGE; @@ -608,84 +610,15 @@ protected void indexDiseases(Path hpoFilePath, Path disgenetFilePath) throws IOE } } - protected void indexMiRTarBase(Path miRTarBaseFile) throws IOException, RocksDBException, CellBaseException { - logger.info(PARSING_LOG_MESSAGE, miRTarBaseFile); - - try (BufferedReader reader = Files.newBufferedReader(miRTarBaseFile)) { - String line; - // Skip header line - reader.readLine(); - - String currentMiRTarBaseId = null; - String currentMiRNA = null; - String currentGene = null; - List targetGenes = new ArrayList<>(); - Map> geneToMirna = new HashMap<>(); - - while ((line = reader.readLine()) != null) { - String[] field = line.split("\t", -1); - if (field.length != 9) { - throw new CellBaseException("Invalid number of columns " + field.length + " (expected 9 columns) parsing file " - + miRTarBaseFile + ". 
Line: " + line); - } - - // #0: miRTarBase ID - String miRTarBaseId = field[0]; - if (currentMiRTarBaseId == null) { - currentMiRTarBaseId = miRTarBaseId; - } - - // #1: miRNA - String miRNA = field[1]; - if (currentMiRNA == null) { - currentMiRNA = miRNA; - } - - // #2: Species (miRNA) - - // #3: Target Gene - String geneName = field[3]; - if (currentGene == null) { - currentGene = geneName; - } - - // #4: Target Gene (Entrez ID) - // #5: Species (Target Gene) - - if (!miRTarBaseId.equals(currentMiRTarBaseId) || !geneName.equals(currentGene)) { - // new entry, store current one - MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, "miRTarBase", currentMiRNA, targetGenes); - addValueToMapElement(geneToMirna, currentGene, miRnaTarget); - targetGenes = new ArrayList<>(); - currentGene = geneName; - currentMiRTarBaseId = miRTarBaseId; - currentMiRNA = miRNA; - } - - // #6: Experiments - String experiment = field[6]; - - // #7: Support Type - String supportType = field[7]; - - // #8: pubmed - String pubmed = field[8]; - - targetGenes.add(new TargetGene(experiment, supportType, pubmed)); - } - - // parse last entry - MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, MIRTARBASE_DATA, currentMiRNA, targetGenes); - addValueToMapElement(geneToMirna, currentGene, miRnaTarget); - - for (Map.Entry> entry : geneToMirna.entrySet()) { - rocksDbManager.update(rocksdb, entry.getKey() + MIRTARBASE_SUFFIX, entry.getValue()); - } + protected void indexMiRTarBase(Path miRTarBaseFile) throws IOException, RocksDBException { + MiRTarBaseIndexer miRTarBaseIndexer = new MiRTarBaseIndexer(); + Map> result = miRTarBaseIndexer.index(miRTarBaseFile); + for (Map.Entry> entry : result.entrySet()) { + rocksDbManager.update(rocksdb, entry.getKey() + MIRTARBASE_SUFFIX, entry.getValue()); } - logger.info(PARSING_DONE_LOG_MESSAGE, miRTarBaseFile); } - protected static void addValueToMapElement(Map> map, String key, T value) { + public static void addValueToMapElement(Map> map, 
String key, T value) { if (map.containsKey(key)) { map.get(key).add(value); } else { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/MiRTarBaseIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/MiRTarBaseIndexer.java new file mode 100644 index 0000000000..84a3adc73e --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/MiRTarBaseIndexer.java @@ -0,0 +1,148 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.opencb.cellbase.lib.builders; + +import org.apache.commons.lang3.StringUtils; +import org.apache.poi.ss.usermodel.*; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.opencb.biodata.models.core.MirnaTarget; +import org.opencb.biodata.models.core.TargetGene; +import org.opencb.commons.utils.FileUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.math.BigDecimal; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.opencb.cellbase.lib.EtlCommons.MIRTARBASE_DATA; +import static org.opencb.cellbase.lib.builders.CellBaseBuilder.PARSING_DONE_LOG_MESSAGE; +import static org.opencb.cellbase.lib.builders.CellBaseBuilder.PARSING_LOG_MESSAGE; + +public class MiRTarBaseIndexer { + + protected Logger logger; + + public MiRTarBaseIndexer() { + logger = LoggerFactory.getLogger(this.getClass()); + } + + public Map> index(Path miRTarBaseFile) throws IOException { + FileUtils.checkFile(miRTarBaseFile); + + logger.info(PARSING_LOG_MESSAGE, miRTarBaseFile); + + Map> geneToMirna = new HashMap<>(); + + try (InputStream fis = new FileInputStream(miRTarBaseFile.toFile()); + Workbook workbook = new XSSFWorkbook(fis)) { + + // Get the first sheet + Sheet sheet = workbook.getSheetAt(0); + + String currentMiRTarBaseId = null; + String currentMiRNA = null; + String currentGene = null; + List targetGenes = new ArrayList<>(); + + for (int rowNum = sheet.getFirstRowNum() + 1; rowNum <= sheet.getLastRowNum(); rowNum++) { + Row row = sheet.getRow(rowNum); + + // Sanity check + if (row.getPhysicalNumberOfCells() != 9) { + logger.warn("Error parsing line {}: invalid number of columns {} (expected 9 columns). 
Line {}.", + rowNum + 1, row.getPhysicalNumberOfCells()); + continue; + } + + if (row.getCell(0).getCellType() != CellType.STRING || row.getCell(0).getStringCellValue() == null + || row.getCell(1).getCellType() != CellType.STRING || row.getCell(1).getStringCellValue() == null + || row.getCell(3).getCellType() != CellType.STRING || row.getCell(3).getStringCellValue() == null) { + logger.warn("Error parsing line {}: mandatory fields(miRTarBase ID, miRNA, Target Gene) are empty or wrong cell type.", + rowNum + 1); + continue; + } + + // #0: miRTarBase ID + Cell cell = row.getCell(0); + String miRTarBaseId = cell.getStringCellValue(); + if (currentMiRTarBaseId == null) { + currentMiRTarBaseId = miRTarBaseId; + } + + // #1: miRNA + cell = row.getCell(1); + String miRNA = cell.getStringCellValue(); + if (currentMiRNA == null) { + currentMiRNA = miRNA; + } + + // #2: Species (miRNA) + + // #3: Target Gene + cell = row.getCell(3); + String geneName = cell.getStringCellValue(); + if (currentGene == null) { + currentGene = geneName; + } + + // #4: Target Gene (Entrez ID) + // #5: Species (Target Gene) + + if (!miRTarBaseId.equals(currentMiRTarBaseId) || !geneName.equals(currentGene)) { + // new entry, store current one + MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, MIRTARBASE_DATA, currentMiRNA, targetGenes); + GeneBuilderIndexer.addValueToMapElement(geneToMirna, currentGene, miRnaTarget); + targetGenes = new ArrayList<>(); + currentGene = geneName; + currentMiRTarBaseId = miRTarBaseId; + currentMiRNA = miRNA; + } + + // #6: Experiments + cell = row.getCell(6); + String experiment = (cell.getCellType() == CellType.STRING ? cell.getStringCellValue() : null); + + // #7: Support Type + cell = row.getCell(7); + String supportType = (cell.getCellType() == CellType.STRING ? 
cell.getStringCellValue() : null); + + // #8: pubmed + cell = row.getCell(8); + String pubmed = new BigDecimal(cell.getNumericCellValue()).toString(); + + if (StringUtils.isNotEmpty(experiment) || StringUtils.isNotEmpty(supportType) || StringUtils.isNotEmpty(pubmed)) { + targetGenes.add(new TargetGene(experiment, supportType, pubmed)); + } + } + + // parse last entry + MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, MIRTARBASE_DATA, currentMiRNA, targetGenes); + GeneBuilderIndexer.addValueToMapElement(geneToMirna, currentGene, miRnaTarget); + + } + logger.info(PARSING_DONE_LOG_MESSAGE, miRTarBaseFile); + + return geneToMirna; + } +} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java index 7b3b9f345b..42886606b7 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java @@ -26,7 +26,6 @@ import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.opencb.cellbase.lib.EtlCommons; import org.rocksdb.RocksDBException; @@ -36,7 +35,6 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.util.*; -import java.util.stream.Collectors; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -147,27 +145,7 @@ public void check() throws Exception { // Check regulation files // mirtarbase - // The downloaded .xlsx file contains errors and it has to be fixed manually - logger.info("Checking {} folder and files", getDataName(MIRTARBASE_DATA)); - Path downloadRegulationPath = downloadPath.getParent().getParent().resolve(REGULATION_DATA); - List 
mirTarBaseFiles = ((DataSource) dataSourceReader.readValue(downloadRegulationPath.resolve( - getDataVersionFilename(MIRTARBASE_DATA)).toFile())).getUrls().stream().map(u -> Paths.get(u).getFileName().toString()) - .collect(Collectors.toList()); - if (mirTarBaseFiles.size() != 1) { - throw new CellBaseException("One " + getDataName(MIRTARBASE_DATA) + " file is expected at " + downloadRegulationPath - + ", but currently there are " + mirTarBaseFiles.size() + " files"); - } - // The hsa_MIT.xlsx is fixed and converted to hsa_MIT.csv manually - if (!mirTarBaseFiles.get(0).endsWith(XLSX_EXTENSION)) { - throw new CellBaseException("A " + XLSX_EXTENSION + " " + getDataName(MIRTARBASE_DATA) + " file is expected at " - + downloadRegulationPath + ", but currently it is named " + mirTarBaseFiles.get(0)); - } - miRTarBaseFile = downloadRegulationPath.resolve(mirTarBaseFiles.get(0).replace(XLSX_EXTENSION, CSV_EXTENSION)); - if (!Files.exists(miRTarBaseFile)) { - throw new CellBaseException("The " + getDataName(MIRTARBASE_DATA) + " fixed file " + miRTarBaseFile + " does not exist. 
You" - + " have to export the file " + mirTarBaseFiles.get(0) + " to " + miRTarBaseFile.getFileName() + " format separated by" - + " tabs and then execute the script cellbase-app/app/scripts/mirtarbase/fix-gene-symbols.sh"); - } + miRTarBaseFile = checkFiles(MIRTARBASE_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA), 1).get(0).toPath(); logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE, refSeqGeneLabel); checked = true; diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexerTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexerTest.java new file mode 100644 index 0000000000..fabf46f6d7 --- /dev/null +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexerTest.java @@ -0,0 +1,65 @@ +package org.opencb.cellbase.lib.builders; + +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; +import org.opencb.biodata.models.core.MirnaTarget; +import org.opencb.biodata.models.core.TargetGene; + +import java.io.IOException; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +import static org.junit.jupiter.api.Assertions.fail; + +public class GeneBuilderIndexerTest { + + @Test + public void testPareMirTarFile() throws IOException { + + Path mirtarbasePath = Paths.get(this.getClass().getClassLoader().getResource("regulation/hsa_MTI.xlsx").getPath()); + MiRTarBaseIndexer indexer = new MiRTarBaseIndexer(); + Map> result = indexer.index(mirtarbasePath); + + Assertions.assertEquals(5, result.size()); + + List> pairs = Arrays.asList(new ImmutablePair<>("WASH7P", "MIRT000002"), + new ImmutablePair<>("CXCR4", "MIRT000006"), + new ImmutablePair<>("CYP7A1", "MIRT000012"), + new ImmutablePair<>("STAT5A", "MIRT000018"), + new ImmutablePair<>("RASGRP1", "MIRT000019")); + 
+ + for (Pair pair : pairs) { + Assertions.assertTrue(result.containsKey(pair.getKey())); + Assertions.assertEquals(pair.getValue(), result.get(pair.getKey()).get(0).getId()); + } + + // MIRT000018 hsa-miR-222-3p Homo sapiens STAT5A 6776 Homo sapiens qRT-PCR//Luciferase reporter assay//Western blot Functional MTI 20489169 + // MIRT000018 hsa-miR-222-3p Homo sapiens STAT5A 6776 Homo sapiens Luciferase reporter assay Functional MTI 24736554 + Assertions.assertEquals(1, result.get("STAT5A").size()); + Assertions.assertEquals("hsa-miR-222-3p", result.get("STAT5A").get(0).getSourceId()); + Assertions.assertEquals(2, result.get("STAT5A").get(0).getTargets().size()); + for (TargetGene target : result.get("STAT5A").get(0).getTargets()) { + switch (target.getPubmed()) { + case "20489169": { + Assertions.assertEquals("Functional MTI", target.getEvidence()); + Assertions.assertEquals("qRT-PCR//Luciferase reporter assay//Western blot", target.getExperiment()); + break; + } + case "24736554": { + Assertions.assertEquals("Functional MTI", target.getEvidence()); + Assertions.assertEquals("Luciferase reporter assay", target.getExperiment()); + break; + } + default: { + fail(); + } + } + } + } +} \ No newline at end of file diff --git a/cellbase-lib/src/test/resources/configuration.test.yaml b/cellbase-lib/src/test/resources/configuration.test.yaml index 2f7da1b6b7..f5c08b498a 100644 --- a/cellbase-lib/src/test/resources/configuration.test.yaml +++ b/cellbase-lib/src/test/resources/configuration.test.yaml @@ -156,8 +156,6 @@ download: host: https://mirtarbase.cuhk.edu.cn/ version: "9.0" files: - # This file contains errors and has to be fixed before building - # check the script cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/hsa_MTI.xlsx MMUSCULUS_MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/mmu_MTI.xlsx RNORVEGICUS_MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/rno_MTI.xlsx From 
10a579a042c95db51e08ca77e940fc31c3515c22 Mon Sep 17 00:00:00 2001 From: imedina Date: Thu, 4 Jul 2024 16:35:02 +0100 Subject: [PATCH 084/148] Builder improvements and several data cleaning --- ...embl-canonical.pl => ensembl_canonical.pl} | 7 +- .../scripts/ensembl-scripts/genome_info.pl | 26 +- .../cellbase/app/cli/CliOptionsParser.java | 4 +- .../cellbase/app/cli/CommandExecutor.java | 67 +--- .../app/cli/admin/AdminCliOptionsParser.java | 10 +- .../admin/executors/BuildCommandExecutor.java | 222 ++++++------- .../executors/DownloadCommandExecutor.java | 44 +-- .../src/main/resources/configuration.yml | 17 +- ...lBaseBuilder.java => AbstractBuilder.java} | 4 +- .../builders/CaddAllAnnotationBuilder.java | 2 +- .../lib/builders/CaddScoreBuilder.java | 2 +- .../lib/builders/ConservationBuilder.java | 2 +- .../lib/builders/EnsemblGeneBuilder.java | 47 +-- .../builders/EnsemblGeneBuilderIndexer.java | 10 +- .../cellbase/lib/builders/GeneBuilder.java | 5 +- .../lib/builders/GeneBuilderIndexer.java | 126 ++++---- .../lib/builders/GeneBuilderUtils.java | 291 ------------------ .../builders/GeneExpressionAtlasBuilder.java | 2 +- .../builders/GenomeSequenceFastaBuilder.java | 2 +- .../lib/builders/InteractionBuilder.java | 2 +- .../lib/builders/OntologyBuilder.java | 2 +- .../lib/builders/PharmGKBBuilder.java | 2 +- .../cellbase/lib/builders/ProteinBuilder.java | 2 +- .../cellbase/lib/builders/PubMedBuilder.java | 2 +- .../lib/builders/RefSeqGeneBuilder.java | 28 +- .../builders/RefSeqGeneBuilderIndexer.java | 8 +- .../builders/RegulatoryFeatureBuilder.java | 2 +- .../cellbase/lib/builders/RepeatsBuilder.java | 2 +- .../lib/builders/RevelScoreBuilder.java | 2 +- .../cellbase/lib/builders/SpliceBuilder.java | 2 +- .../clinical/variant/ClinVarParser.java | 4 +- .../variant/ClinicalVariantBuilder.java | 4 +- .../clinical/variant/CosmicBuilder.java | 4 +- .../cellbase/lib/download/Downloader.java | 104 ------- .../lib/download/GeneDownloadManager.java | 30 ++ 
.../lib/download/GenomeDownloadManager.java | 2 +- 36 files changed, 326 insertions(+), 766 deletions(-) rename cellbase-app/app/scripts/ensembl-scripts/{ensembl-canonical.pl => ensembl_canonical.pl} (90%) rename cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/{CellBaseBuilder.java => AbstractBuilder.java} (98%) delete mode 100644 cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderUtils.java delete mode 100644 cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java diff --git a/cellbase-app/app/scripts/ensembl-scripts/ensembl-canonical.pl b/cellbase-app/app/scripts/ensembl-scripts/ensembl_canonical.pl similarity index 90% rename from cellbase-app/app/scripts/ensembl-scripts/ensembl-canonical.pl rename to cellbase-app/app/scripts/ensembl-scripts/ensembl_canonical.pl index 5066ccd3ad..9be361a55d 100755 --- a/cellbase-app/app/scripts/ensembl-scripts/ensembl-canonical.pl +++ b/cellbase-app/app/scripts/ensembl-scripts/ensembl_canonical.pl @@ -1,8 +1,11 @@ #!/usr/bin/env perl -# An example script demonstrating the use of BioMart API. 
-# This perl API representation is only available for configuration versions >= 0.5 use strict; +use Getopt::Long; +use Data::Dumper; +use JSON; +use DB_CONFIG; + use BioMart::Initializer; use BioMart::Query; use BioMart::QueryRunner; diff --git a/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl b/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl index e5ecd61c33..f229fdb47d 100755 --- a/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl +++ b/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl @@ -31,7 +31,6 @@ if ($outfile eq "") { $outfile = "/ensembl-data/genome_info.json"; - # $outfile = "/ensembl-data/$species.json"; } #################################################################### @@ -44,17 +43,13 @@ # Bio::EnsEMBL::Registry->load_all("$ENSEMBL_REGISTRY"); if($phylo eq "" || $phylo eq "vertebrate") { print ("In vertebrates section\n"); - if ($species eq "Homo sapiens" && $assembly eq "GRCh38") { - print ("Human selected, assembly ".$assembly." selected, connecting to port ".$ENSEMBL_PORT."\n"); - Bio::EnsEMBL::Registry->load_registry_from_db( - -host => $ENSEMBL_HOST, - -user => $ENSEMBL_USER, - -port => $ENSEMBL_PORT, - -verbose => $verbose - ); - } else { - print ("Human selected, assembly ".$assembly." 
no supported\n"); - } + print ("Species: ".$species.", assembly ".$assembly.", connecting to: ".$ENSEMBL_HOST.":".$ENSEMBL_PORT."\n"); + Bio::EnsEMBL::Registry->load_registry_from_db( + -host => $ENSEMBL_HOST, + -user => $ENSEMBL_USER, + -port => $ENSEMBL_PORT, + -verbose => $verbose + ); } else { print ("In no-vertebrates section\n"); Bio::EnsEMBL::Registry->load_registry_from_db( @@ -66,7 +61,6 @@ my $slice_adaptor = Bio::EnsEMBL::Registry->get_adaptor($species, "core", "Slice"); my $karyotype_adaptor = Bio::EnsEMBL::Registry->get_adaptor($species, "core", "KaryotypeBand"); -# my $gene_adaptor = Bio::EnsEMBL::Registry->get_adaptor($species, "core", "Gene"); #################################################################### my %info_stats = (); @@ -83,12 +77,10 @@ $chromosome{'start'} = int($chrom->start()); $chromosome{'end'} = int($chrom->end()); $chromosome{'size'} = int($chrom->seq_region_length()); -# $chromosome{'numberGenes'} = scalar @{$chrom->get_all_Genes()}; $chromosome{'isCircular'} = $chrom->is_circular(); my @cytobands = (); foreach my $cyto(@{$karyotype_adaptor->fetch_all_by_chr_name($chrom->seq_region_name)}) { -# print $cytoband->name."\n"; my %cytoband = (); $cytoband{'name'} = $cyto->name(); $cytoband{'start'} = int($cyto->start()); @@ -98,7 +90,7 @@ push(@cytobands, \%cytoband); } - ## check if any cytoband has been added + ## Check if any cytoband has been added ## If not a unique cytoband covering all chromosome is added. 
if(@cytobands == 0) { my %cytoband = (); @@ -112,7 +104,6 @@ $chromosome{'cytobands'} = \@cytobands; push(@chromosomes, \%chromosome); -# push(@chrom_ids, $chrom->seq_region_name); } $info_stats{'chromosomes'} = \@chromosomes; @@ -126,7 +117,6 @@ $supercontig{'start'} = int($supercon->start()); $supercontig{'end'} = int($supercon->end()); $supercontig{'size'} = int($supercon->seq_region_length()); -# $supercontig{'numberGenes'} = scalar @{$supercon->get_all_Genes()}; $supercontig{'isCircular'} = $supercon->is_circular(); ## Adding an unique cytoband covering all chromosome is added. diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/CliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/CliOptionsParser.java index 088db087f0..a71663f19f 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/CliOptionsParser.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/CliOptionsParser.java @@ -66,8 +66,8 @@ public class CommonCommandOptions { description = "Set the logging level, accepted values are: debug, info, warn, error and fatal") public String logLevel = "info"; - @Parameter(names = {"-C", "--config"}, arity = 1, - description = "Path to CellBase configuration.yml file") + @Deprecated + @Parameter(names = {"-C", "--config"}, arity = 1, hidden = true, description = "Path to CellBase configuration.yml file") public String conf; } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/CommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/CommandExecutor.java index 39018bf170..64dcc05bfb 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/CommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/CommandExecutor.java @@ -35,18 +35,12 @@ import java.nio.file.Path; import java.nio.file.Paths; -/** - * Created by imedina on 03/02/15. 
- */ + public abstract class CommandExecutor { protected String logLevel; -// protected boolean verbose; protected String conf; - @Deprecated - protected String configFile; - protected String appHome; protected CellBaseConfiguration configuration; @@ -55,35 +49,13 @@ public abstract class CommandExecutor { protected Logger logger; public CommandExecutor() { - } public CommandExecutor(String logLevel, String conf) { this.logLevel = logLevel; this.conf = conf; - /** - * System property 'app.home' is set up by cellbase.sh. If by any reason this is null - * then CELLBASE_HOME environment variable is used instead. - */ - this.appHome = System.getProperty("app.home", System.getenv("CELLBASE_HOME")); - - if (StringUtils.isEmpty(conf)) { - this.conf = this.appHome + "/conf"; - } - - if (logLevel != null && !logLevel.isEmpty()) { - // We must call to this method - setLogLevel(logLevel); - } - } - - public CommandExecutor(String logLevel, boolean verbose, String conf) { - this.logLevel = logLevel; -// this.verbose = verbose; - this.conf = conf; - - /** + /* * System property 'app.home' is set up by cellbase.sh. If by any reason this is null * then CELLBASE_HOME environment variable is used instead. */ @@ -124,29 +96,16 @@ public void setLogLevel(String logLevel) { this.logLevel = logLevel; } -// public boolean isVerbose() { -// return verbose; -// } -// -// public void setVerbose(boolean verbose) { -// this.verbose = verbose; -// } - - public String getConfigFile() { - return configFile; - } - - public void setConfigFile(String configFile) { - this.configFile = configFile; - } - public Logger getLogger() { return logger; } - /* + /** * This method attempts to first data configuration from CLI parameter, if not present then uses * the configuration from installation directory, if not exists then loads JAR configuration.json or yml. 
+ * + * @throws URISyntaxException If any URI problem occurs + * @throws IOException If any IO problem occurs */ public void loadCellBaseConfiguration() throws URISyntaxException, IOException { Path confPath = Paths.get(this.conf); @@ -154,11 +113,13 @@ public void loadCellBaseConfiguration() throws URISyntaxException, IOException { if (Files.exists(confPath.resolve("configuration.json"))) { logger.debug("Loading configuration from '{}'", confPath.resolve("configuration.json").toAbsolutePath()); - this.configuration = CellBaseConfiguration.load(new FileInputStream(confPath.resolve("configuration.json").toFile()), - CellBaseConfiguration.ConfigurationFileFormat.JSON); + this.configuration = CellBaseConfiguration + .load(Files.newInputStream(confPath.resolve("configuration.json").toFile().toPath()), + CellBaseConfiguration.ConfigurationFileFormat.JSON); } else if (Files.exists(Paths.get(this.appHome + "/conf/configuration.yml"))) { logger.debug("Loading configuration from '{}'", this.appHome + "/conf/configuration.yml"); - this.configuration = CellBaseConfiguration.load(new FileInputStream(new File(this.appHome + "/conf/configuration.yml"))); + this.configuration = CellBaseConfiguration + .load(Files.newInputStream(new File(this.appHome + "/conf/configuration.yml").toPath())); } else { InputStream inputStream = CellBaseConfiguration.class.getClassLoader().getResourceAsStream("conf/configuration.json"); String configurationFilePath = "conf/configuration.json"; @@ -198,10 +159,4 @@ public void loadClientConfiguration() throws IOException { } } } - - protected void makeDir(Path folderPath) throws IOException { - if (!Files.exists(folderPath)) { - Files.createDirectories(folderPath); - } - } } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java index ebf647f91b..55a446c4ea 100644 --- 
a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java @@ -86,8 +86,8 @@ public class DownloadCommandOptions { @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to download: " + GENOME_DATA + "," + GENE_DATA + "," + VARIATION_FUNCTIONAL_SCORE_DATA + "," + MISSENSE_VARIATION_SCORE_DATA + "," + REGULATION_DATA + "," + PROTEIN_DATA - + "," + CONSERVATION_DATA + "," + CLINICAL_VARIANT_DATA + "," + REPEATS_DATA + "," + ONTOLOGY_DATA + "," + PUBMED_DATA - + "," + PHARMACOGENOMICS_DATA + "; or use 'all' to download everything", required = true, arity = 1) + + "," + CONSERVATION_DATA + "," + CLINICAL_VARIANT_DATA + "," + REPEATS_DATA + "," + ONTOLOGY_DATA + "," + SPLICE_SCORE_DATA + + "," + PUBMED_DATA + "," + PHARMACOGENOMICS_DATA + "; or use 'all' to download everything", required = true, arity = 1) public String data; @Parameter(names = {"-o", "--outdir"}, description = "Downloaded files will be saved in this directory.", required = true, @@ -101,9 +101,9 @@ public class BuildCommandOptions { @ParametersDelegate public CommonCommandOptions commonOptions = commonCommandOptions; - @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to build: " + GENOME_DATA + "," + GENE_DATA + "," - + VARIATION_FUNCTIONAL_SCORE_DATA + "," + MISSENSE_VARIATION_SCORE_DATA + "," + REGULATION_DATA + "," + PROTEIN_DATA + "," - + CONSERVATION_DATA + "," + CLINICAL_VARIANT_DATA + "," + REPEATS_DATA + "," + ONTOLOGY_DATA + "," + SPLICE_SCORE_DATA + @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to build: " + GENOME_DATA + "," + GENE_DATA + + "," + VARIATION_FUNCTIONAL_SCORE_DATA + "," + MISSENSE_VARIATION_SCORE_DATA + "," + REGULATION_DATA + "," + PROTEIN_DATA + + "," + CONSERVATION_DATA + "," + CLINICAL_VARIANT_DATA + "," + REPEATS_DATA + "," + ONTOLOGY_DATA + "," + 
SPLICE_SCORE_DATA + "," + PUBMED_DATA + "," + PHARMACOGENOMICS_DATA + "; or use 'all' to build everything", required = true, arity = 1) public String data; diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index c3853c4478..05d5de191b 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -43,25 +43,23 @@ import java.util.Collections; import java.util.List; -import static org.opencb.cellbase.core.utils.SpeciesUtils.getSpeciesShortname; import static org.opencb.cellbase.lib.EtlCommons.*; -/** - * Created by imedina on 03/02/15. - */ + public class BuildCommandExecutor extends CommandExecutor { - private AdminCliOptionsParser.BuildCommandOptions buildCommandOptions; - private Path output; - private Path buildFolder = null; // /_/generated-json - private Path downloadFolder = null; // /_/download + private final AdminCliOptionsParser.BuildCommandOptions buildCommandOptions; + private final Path outputDirectory; + + private Path buildFolder = null; + private Path downloadFolder = null; private boolean normalize = true; + private SpeciesConfiguration speciesConfiguration; private SpeciesConfiguration.Assembly assembly; private String ensemblRelease; private boolean flexibleGTFParsing; - private SpeciesConfiguration speciesConfiguration; private static final List VALID_SOURCES_TO_BUILD = Arrays.asList(GENOME_DATA, GENE_DATA, VARIATION_FUNCTIONAL_SCORE_DATA, MISSENSE_VARIATION_SCORE_DATA, REGULATION_DATA, PROTEIN_DATA, CONSERVATION_DATA, CLINICAL_VARIANT_DATA, REPEATS_DATA, @@ -71,7 +69,7 @@ public BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildComma super(buildCommandOptions.commonOptions.logLevel, 
buildCommandOptions.commonOptions.conf); this.buildCommandOptions = buildCommandOptions; - this.output = Paths.get(buildCommandOptions.outputDirectory); + this.outputDirectory = Paths.get(buildCommandOptions.outputDirectory); normalize = !buildCommandOptions.skipNormalize; this.flexibleGTFParsing = buildCommandOptions.flexibleGTFParsing; @@ -83,22 +81,21 @@ public BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildComma * @throws CellBaseException Exception */ public void execute() throws CellBaseException { - String data = null; try { - // Check data sources - List dataList = checkDataSources(); - // Output directory need to be created if it doesn't exist - if (!Files.exists(output)) { - Files.createDirectories(output); + if (!Files.exists(outputDirectory)) { + Files.createDirectories(outputDirectory); } - speciesConfiguration = SpeciesUtils.getSpeciesConfiguration(configuration, buildCommandOptions.species); + // Get the species + String species = buildCommandOptions.species; + speciesConfiguration = SpeciesUtils.getSpeciesConfiguration(configuration, species); if (speciesConfiguration == null) { throw new CellBaseException("Invalid species: '" + buildCommandOptions.species + "'"); } - if (!StringUtils.isEmpty(buildCommandOptions.assembly)) { + // Get the assembly + if (StringUtils.isNotEmpty(buildCommandOptions.assembly)) { assembly = SpeciesUtils.getAssembly(speciesConfiguration, buildCommandOptions.assembly); if (assembly == null) { throw new CellBaseException("Invalid assembly: '" + buildCommandOptions.assembly + "'"); @@ -110,57 +107,58 @@ public void execute() throws CellBaseException { String ensemblVersion = assembly.getEnsemblVersion(); ensemblRelease = "release-" + ensemblVersion.split("_")[0]; - String spShortName = getSpeciesShortname(speciesConfiguration); + String spShortName = SpeciesUtils.getSpeciesShortname(speciesConfiguration); String spAssembly = assembly.getName().toLowerCase(); - Path spFolder = output.resolve(spShortName + 
"_" + spAssembly); - // /_/download - downloadFolder = output.resolve(spFolder + "/download"); + Path spFolder = outputDirectory.resolve(spShortName + "_" + spAssembly); + downloadFolder = outputDirectory.resolve(spFolder + "/download"); if (!Files.exists(downloadFolder)) { throw new CellBaseException("Download folder not found '" + spShortName + "_" + spAssembly + "/download'"); } - // /_/generated_json - buildFolder = output.resolve(spFolder + "/generated_json"); + buildFolder = outputDirectory.resolve(spFolder + "/generated_json"); if (!buildFolder.toFile().exists()) { - makeDir(buildFolder); + if (!Files.exists(buildFolder)) { + Files.createDirectories(buildFolder); + } } - CellBaseBuilder parser; - for (int i = 0; i < dataList.size(); i++) { - data = dataList.get(i); + // Check data sources + List dataList = getDataList(species, speciesConfiguration); + AbstractBuilder parser; + for (String data : dataList) { switch (data) { case GENOME_DATA: parser = buildGenomeSequence(); break; + case CONSERVATION_DATA: + parser = buildConservation(); + break; + case REPEATS_DATA: + parser = buildRepeats(); + break; case GENE_DATA: parser = buildGene(); break; - case VARIATION_FUNCTIONAL_SCORE_DATA: - parser = buildCadd(); - break; - case MISSENSE_VARIATION_SCORE_DATA: - parser = buildRevel(); + case PROTEIN_DATA: + parser = buildProtein(); break; case REGULATION_DATA: parser = buildRegulation(); break; - case PROTEIN_DATA: - parser = buildProtein(); + case VARIATION_FUNCTIONAL_SCORE_DATA: + parser = buildCadd(); break; - case CONSERVATION_DATA: - parser = buildConservation(); + case MISSENSE_VARIATION_SCORE_DATA: + parser = buildRevel(); break; case CLINICAL_VARIANT_DATA: parser = buildClinicalVariants(); break; - case REPEATS_DATA: - parser = buildRepeats(); + case SPLICE_SCORE_DATA: + parser = buildSplice(); break; case ONTOLOGY_DATA: parser = buildObo(); break; - case SPLICE_SCORE_DATA: - parser = buildSplice(); - break; case PUBMED_DATA: parser = buildPubMed(); break; 
@@ -168,26 +166,42 @@ public void execute() throws CellBaseException { parser = buildPharmacogenomics(); break; default: - throw new IllegalArgumentException("Value '" + data + "' is not allowed for the data parameter." - + " Valid values are: " + StringUtils.join(VALID_SOURCES_TO_BUILD, ",") + "; or use 'all' to build" - + " everything"); + throw new IllegalArgumentException("Data parameter '" + data + "' is not allowed for '" + species + "'. " + + "Valid values are: " + StringUtils.join(speciesConfiguration.getData(), ",") + + ". You can use data parameter 'all' to download everything"); } - if (parser != null) { - parser.parse(); - parser.disconnect(); - } + parser.parse(); + parser.disconnect(); } + } catch (InterruptedException e) { + // Restore interrupted state... + Thread.currentThread().interrupt(); + throw new CellBaseException("Error executing command line 'build': " + e.getMessage(), e); } catch (Exception e) { - String msg = "Error executing the command 'build'"; - if (StringUtils.isNotEmpty(data)) { - msg += ". 
The last data being built was '" + data + "'"; - } - throw new CellBaseException(msg + ": " + e.getMessage(), e); + throw new CellBaseException("Error executing command line 'build': " + e.getMessage(), e); } } - private CellBaseBuilder buildRepeats() throws CellBaseException { + private AbstractBuilder buildGenomeSequence() throws CellBaseException { + // Sanity check + Path genomeVersionPath = downloadFolder.resolve(GENOME_DATA).resolve(getDataVersionFilename(GENOME_DATA)); + copyVersionFiles(Collections.singletonList(genomeVersionPath), buildFolder.resolve(GENOME_DATA)); + + // Get FASTA path + Path fastaPath = getFastaReferenceGenome(); + + // Create serializer and return the genome builder + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(GENOME_DATA), GENOME_DATA); + return new GenomeSequenceFastaBuilder(fastaPath, serializer); + } + + private AbstractBuilder buildGene() throws CellBaseException { + return new GeneBuilder(downloadFolder.resolve(GENE_DATA), buildFolder.resolve(GENE_DATA), speciesConfiguration, flexibleGTFParsing, + configuration); + } + + private AbstractBuilder buildRepeats() throws CellBaseException { // Sanity check Path repeatsDownloadPath = downloadFolder.resolve(REPEATS_DATA); List versionPaths = Arrays.asList(repeatsDownloadPath.resolve(getDataVersionFilename(TRF_DATA)), @@ -200,7 +214,7 @@ private CellBaseBuilder buildRepeats() throws CellBaseException { return new RepeatsBuilder(repeatsDownloadPath, serializer, configuration); } - private CellBaseBuilder buildObo() throws CellBaseException { + private AbstractBuilder buildObo() throws CellBaseException { Path oboDownloadPath = downloadFolder.resolve(ONTOLOGY_DATA); Path oboBuildPath = buildFolder.resolve(ONTOLOGY_DATA); List versionPaths = Arrays.asList(oboDownloadPath.resolve(getDataVersionFilename(HPO_OBO_DATA)), @@ -214,39 +228,7 @@ private CellBaseBuilder buildObo() throws CellBaseException { return new OntologyBuilder(oboDownloadPath, 
serializer); } - /** - * @deprecated (when using the new copyVersionFiles) - */ - @Deprecated - private void copyVersionFiles(List pathList) { - for (Path path : pathList) { - try { - Files.copy(path, downloadFolder.resolve(path.getFileName()), StandardCopyOption.REPLACE_EXISTING); - } catch (IOException e) { - logger.warn("Version file {} not found - skipping", path); - } - } - } - - private CellBaseBuilder buildGenomeSequence() throws CellBaseException { - // Sanity check - Path genomeVersionPath = downloadFolder.resolve(GENOME_DATA).resolve(getDataVersionFilename(GENOME_DATA)); - copyVersionFiles(Collections.singletonList(genomeVersionPath), buildFolder.resolve(GENOME_DATA)); - - // Get FASTA path - Path fastaPath = getFastaReferenceGenome(); - - // Create serializer and return the genome builder - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(GENOME_DATA), GENOME_DATA); - return new GenomeSequenceFastaBuilder(fastaPath, serializer); - } - - private CellBaseBuilder buildGene() throws CellBaseException { - return new GeneBuilder(downloadFolder.resolve(GENE_DATA), buildFolder.resolve(GENE_DATA), speciesConfiguration, flexibleGTFParsing, - configuration); - } - - private CellBaseBuilder buildCadd() throws CellBaseException { + private AbstractBuilder buildCadd() throws CellBaseException { // Sanity check Path caddDownloadPath = downloadFolder.resolve(VARIATION_FUNCTIONAL_SCORE_DATA).resolve(CADD_DATA); Path caddBuildPath = buildFolder.resolve(VARIATION_FUNCTIONAL_SCORE_DATA).resolve(CADD_DATA); @@ -257,7 +239,7 @@ private CellBaseBuilder buildCadd() throws CellBaseException { return new CaddScoreBuilder(caddDownloadPath, serializer); } - private CellBaseBuilder buildRevel() throws CellBaseException { + private AbstractBuilder buildRevel() throws CellBaseException { // Sanity check Path revelDownloadPath = downloadFolder.resolve(MISSENSE_VARIATION_SCORE_DATA).resolve(REVEL_DATA); Path revelBuildPath = 
buildFolder.resolve(MISSENSE_VARIATION_SCORE_DATA).resolve(REVEL_DATA); @@ -268,7 +250,7 @@ private CellBaseBuilder buildRevel() throws CellBaseException { return new RevelScoreBuilder(revelDownloadPath, serializer); } - private CellBaseBuilder buildRegulation() throws CellBaseException { + private AbstractBuilder buildRegulation() throws CellBaseException { // Sanity check Path regulationDownloadPath = downloadFolder.resolve(REGULATION_DATA); Path regulationBuildPath = buildFolder.resolve(REGULATION_DATA); @@ -280,7 +262,7 @@ private CellBaseBuilder buildRegulation() throws CellBaseException { return new RegulatoryFeatureBuilder(regulationDownloadPath, serializer); } - private CellBaseBuilder buildProtein() throws CellBaseException { + private AbstractBuilder buildProtein() throws CellBaseException { // Sanity check Path proteinDownloadPath = downloadFolder.resolve(PROTEIN_DATA); Path proteinBuildPath = buildFolder.resolve(PROTEIN_DATA); @@ -292,7 +274,7 @@ private CellBaseBuilder buildProtein() throws CellBaseException { return new ProteinBuilder(proteinDownloadPath, speciesConfiguration.getScientificName(), serializer); } - private CellBaseBuilder buildConservation() throws CellBaseException { + private AbstractBuilder buildConservation() throws CellBaseException { // Sanity check Path conservationDownloadPath = downloadFolder.resolve(CONSERVATION_DATA); Path conservationBuildPath = buildFolder.resolve(CONSERVATION_DATA); @@ -305,7 +287,7 @@ private CellBaseBuilder buildConservation() throws CellBaseException { return new ConservationBuilder(conservationDownloadPath, conservationChunkSize, serializer); } - private CellBaseBuilder buildClinicalVariants() throws CellBaseException { + private AbstractBuilder buildClinicalVariants() throws CellBaseException { // Sanity check Path clinicalDownloadPath = downloadFolder.resolve(CLINICAL_VARIANT_DATA); Path clinicalBuildPath = buildFolder.resolve(CLINICAL_VARIANT_DATA); @@ -335,7 +317,7 @@ private String 
getDefaultHumanAssembly() { private Path getFastaReferenceGenome() throws CellBaseException { // Check FASTA and unzip if necessary String ensemblUrl = getEnsemblUrl(configuration.getDownload().getEnsembl(), ensemblRelease, ENSEMBL_PRIMARY_FA_FILE_ID, - getSpeciesShortname(speciesConfiguration), assembly.getName(), null); + SpeciesUtils.getSpeciesShortname(speciesConfiguration), assembly.getName(), null); String fastaFilename = Paths.get(ensemblUrl).getFileName().toString(); Path fastaPath = downloadFolder.resolve(GENOME_DATA).resolve(fastaFilename); if (fastaPath.toFile().exists()) { @@ -358,7 +340,7 @@ private Path getFastaReferenceGenome() throws CellBaseException { return fastaPath; } - private CellBaseBuilder buildSplice() throws IOException, CellBaseException { + private AbstractBuilder buildSplice() throws IOException, CellBaseException { Path spliceInputFolder = downloadFolder.resolve(EtlCommons.SPLICE_SCORE_DATA); Path spliceOutputFolder = buildFolder.resolve(EtlCommons.SPLICE_SCORE_DATA); if (!spliceOutputFolder.toFile().exists()) { @@ -375,7 +357,7 @@ private CellBaseBuilder buildSplice() throws IOException, CellBaseException { return new SpliceBuilder(spliceInputFolder, serializer); } - private CellBaseBuilder buildPubMed() throws CellBaseException { + private AbstractBuilder buildPubMed() throws CellBaseException { // Sanity check Path pubMedDownloadPath = downloadFolder.resolve(PUBMED_DATA); Path pubMedBuildPath = buildFolder.resolve(PUBMED_DATA); @@ -386,11 +368,11 @@ private CellBaseBuilder buildPubMed() throws CellBaseException { return new PubMedBuilder(pubMedDownloadPath, serializer, configuration); } - private CellBaseBuilder buildPharmacogenomics() throws CellBaseException { + private AbstractBuilder buildPharmacogenomics() throws CellBaseException { // Sanity check Path pharmGkbDownloadPath = downloadFolder.resolve(PHARMACOGENOMICS_DATA).resolve(PHARMGKB_DATA); Path pharmGkbBuildPath = 
buildFolder.resolve(PHARMACOGENOMICS_DATA).resolve(PHARMGKB_DATA); - copyVersionFiles(Arrays.asList(pharmGkbDownloadPath.resolve(getDataVersionFilename(PHARMGKB_DATA))), pharmGkbBuildPath); + copyVersionFiles(Collections.singletonList(pharmGkbDownloadPath.resolve(getDataVersionFilename(PHARMGKB_DATA))), pharmGkbBuildPath); // Create the file serializer and the PharmGKB feature builder CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(pharmGkbBuildPath); @@ -439,34 +421,24 @@ private void copyVersionFiles(List versionPaths, Path targetPath) throws C } } - private List checkDataSources() { - if (StringUtils.isEmpty(buildCommandOptions.data)) { - throw new IllegalArgumentException("Missing data parameter. Valid values are: " - + StringUtils.join(VALID_SOURCES_TO_BUILD, ",") + "; or use 'all' to download everything"); - } - List dataList = Arrays.asList(buildCommandOptions.data.split(",")); - for (String data : dataList) { - switch (data) { - case GENOME_DATA: - case GENE_DATA: - case REFSEQ_DATA: - case VARIATION_FUNCTIONAL_SCORE_DATA: - case MISSENSE_VARIATION_SCORE_DATA: - case REGULATION_DATA: - case PROTEIN_DATA: - case CONSERVATION_DATA: - case CLINICAL_VARIANT_DATA: - case REPEATS_DATA: - case ONTOLOGY_DATA: - case SPLICE_SCORE_DATA: - case PUBMED_DATA: - case PHARMACOGENOMICS_DATA: - break; - default: - throw new IllegalArgumentException("Value '" + data + "' is not allowed for the data parameter. 
Valid values are: " - + StringUtils.join(VALID_SOURCES_TO_BUILD, ",") + "; or use 'all' to build everything"); + private List getDataList(String species, SpeciesConfiguration speciesConfig) throws CellBaseException { + // No need to check if 'data' exists since it is declared as required in JCommander + List dataList; + if ("all".equalsIgnoreCase(buildCommandOptions.data)) { + // Download all data sources for the species in the configuration.yml file + dataList = speciesConfig.getData(); + } else { + // Check if the data sources requested are valid for the species + dataList = Arrays.asList(buildCommandOptions.data.split(",")); + for (String data : dataList) { + if (!speciesConfig.getData().contains(data)) { + throw new CellBaseException("Data parameter '" + data + "' does not exist or it is not allowed for '" + species + "'. " + + "Valid values are: " + StringUtils.join(speciesConfig.getData(), ",") + ". " + + "You can use data parameter 'all' to build everything"); + } } } return dataList; } + } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java index 8718bb29a9..6c1f4194d5 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java @@ -22,9 +22,7 @@ import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.utils.SpeciesUtils; -import org.opencb.cellbase.lib.download.AbstractDownloadManager; -import org.opencb.cellbase.lib.download.DownloadFile; -import org.opencb.cellbase.lib.download.Downloader; +import org.opencb.cellbase.lib.download.*; import java.nio.file.Path; import java.nio.file.Paths; @@ -59,58 +57,64 @@ public void execute() throws 
CellBaseException { String assembly = downloadCommandOptions.speciesAndAssemblyOptions.assembly; // Get the valid list of data sources - SpeciesConfiguration speciesConfig = SpeciesUtils.getSpeciesConfiguration(configuration, species); - List dataList = getDataList(species, speciesConfig); + SpeciesConfiguration speciesConfiguration = SpeciesUtils.getSpeciesConfiguration(configuration, species); + if (speciesConfiguration == null) { + throw new CellBaseException("Invalid species: '" + downloadCommandOptions.speciesAndAssemblyOptions.species + "'"); + } + List dataList = getDataList(species, speciesConfiguration); logger.info("Downloading the following data sources: {}", StringUtils.join(dataList, ",")); List downloadFiles = new ArrayList<>(); - Downloader downloader = new Downloader(species, assembly, outputDirectory, configuration); + AbstractDownloadManager downloader; for (String data : dataList) { switch (data) { case GENOME_DATA: - downloadFiles.addAll(downloader.downloadGenome()); + downloader = new GenomeDownloadManager(species, assembly, outputDirectory, configuration); break; case CONSERVATION_DATA: - downloadFiles.addAll(downloader.downloadConservation()); + downloader = new ConservationDownloadManager(species, assembly, outputDirectory, configuration); break; case REPEATS_DATA: - downloadFiles.addAll(downloader.downloadRepeats()); + downloader = new RepeatsDownloadManager(species, assembly, outputDirectory, configuration); break; case GENE_DATA: - downloadFiles.addAll(downloader.downloadGene()); + downloader = new GeneDownloadManager(species, assembly, outputDirectory, configuration); break; case PROTEIN_DATA: - downloadFiles.addAll(downloader.downloadProtein()); + downloader = new ProteinDownloadManager(species, assembly, outputDirectory, configuration); break; case REGULATION_DATA: - downloadFiles.addAll(downloader.downloadRegulation()); + downloader = new RegulationDownloadManager(species, assembly, outputDirectory, configuration); break; case 
VARIATION_FUNCTIONAL_SCORE_DATA: - downloadFiles.addAll(downloader.downloadCaddScores()); + downloader = new CaddDownloadManager(species, assembly, outputDirectory, configuration); break; case MISSENSE_VARIATION_SCORE_DATA: - downloadFiles.addAll(downloader.downloadPredictionScores()); + downloader = new MissenseScoresDownloadManager(species, assembly, outputDirectory, configuration); break; case CLINICAL_VARIANT_DATA: - downloadFiles.addAll(downloader.downloadClinicalVariants()); + downloader = new ClinicalDownloadManager(species, assembly, outputDirectory, configuration); break; case SPLICE_SCORE_DATA: - downloadFiles.addAll(downloader.downloadSpliceScores()); + downloader = new SpliceScoreDownloadManager(species, assembly, outputDirectory, configuration); break; case ONTOLOGY_DATA: - downloadFiles.addAll(downloader.downloadOntologies()); + downloader = new OntologyDownloadManager(species, assembly, outputDirectory, configuration); break; case PUBMED_DATA: - downloadFiles.addAll(downloader.downloadPubMed()); + downloader = new PubMedDownloadManager(species, assembly, outputDirectory, configuration); break; case PHARMACOGENOMICS_DATA: - downloadFiles.addAll(downloader.downloadPharmKGB()); + downloader = new PharmGKBDownloadManager(species, assembly, outputDirectory, configuration); break; default: throw new IllegalArgumentException("Data parameter '" + data + "' is not allowed for '" + species + "'. " - + "Valid values are: " + StringUtils.join(speciesConfig.getData(), ",") + + "Valid values are: " + StringUtils.join(speciesConfiguration.getData(), ",") + ". 
You can use data parameter 'all' to download everything"); } + + // Call to download method and add the files to the list + downloadFiles.addAll(downloader.download()); } AbstractDownloadManager.writeDownloadLogFile(outputDirectory, downloadFiles); } catch (InterruptedException e) { diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 88d8d8a9fd..50973fa198 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -54,11 +54,11 @@ download: # To be generated manually XREFS: "manual@xrefs.txt" # To be downloaded manually - HAEM_ONC_TRANSCRIPTS: "manual@EGLH_HaemOnc_transcripts.txt" +# HAEM_ONC_TRANSCRIPTS: "manual@EGLH_HaemOnc_transcripts.txt" +# # To be downloaded manually +# TSO500: "manual@TSO500_transcripts.txt" # To be downloaded manually - TSO500: "manual@TSO500_transcripts.txt" - # To be downloaded manually - CANONICAL: "manual@ensembl_canonical.txt" +# CANONICAL: "manual@ensembl_canonical.txt" ensemblGenomes: database: @@ -336,13 +336,14 @@ species: - conservation - repeats - gene - - regulation - protein - - clinical_variant - - missense_variation_functional_score - - ontology + - regulation - variation_functional_score + - missense_variation_functional_score + - clinical_variant - splice_score + - ontology + - pubmed - pharmacogenomics - id: mmusculus scientificName: Mus musculus diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AbstractBuilder.java similarity index 98% rename from cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java rename to cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AbstractBuilder.java index 4056dd18b4..85a04e2f8f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AbstractBuilder.java @@ -40,7 +40,7 @@ import static org.opencb.cellbase.lib.EtlCommons.*; -public abstract class CellBaseBuilder { +public abstract class AbstractBuilder { protected CellBaseSerializer serializer; protected ObjectReader dataSourceReader = new ObjectMapper().readerFor(DataSource.class); @@ -61,7 +61,7 @@ public abstract class CellBaseBuilder { public static final String PARSING_LOG_MESSAGE = "Parsing {} ..."; public static final String PARSING_DONE_LOG_MESSAGE = "Parsing done."; - public CellBaseBuilder(CellBaseSerializer serializer) { + public AbstractBuilder(CellBaseSerializer serializer) { logger = LoggerFactory.getLogger(this.getClass()); this.serializer = serializer; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddAllAnnotationBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddAllAnnotationBuilder.java index b96985c399..7dd8b6a5bd 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddAllAnnotationBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddAllAnnotationBuilder.java @@ -29,7 +29,7 @@ * @since October 08, 2014 */ @Deprecated -public class CaddAllAnnotationBuilder extends CellBaseBuilder { +public class CaddAllAnnotationBuilder extends AbstractBuilder { private final Path caddFilePath; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java index d0597c4c2a..64e4dda059 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java @@ -34,7 +34,7 @@ /** * Created by imedina on 06/11/15. 
*/ -public class CaddScoreBuilder extends CellBaseBuilder { +public class CaddScoreBuilder extends AbstractBuilder { private Path caddDownloadPath; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java index d43c38cb7a..9f2ae630f9 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java @@ -34,7 +34,7 @@ import static org.opencb.cellbase.lib.EtlCommons.*; -public class ConservationBuilder extends CellBaseBuilder { +public class ConservationBuilder extends AbstractBuilder { private Path conservedRegionPath; private int chunkSize; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java index e8ea728da3..6e76bc8d62 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java @@ -44,15 +44,15 @@ import static org.opencb.cellbase.lib.EtlCommons.*; -public class EnsemblGeneBuilder extends CellBaseBuilder { +public class EnsemblGeneBuilder extends AbstractBuilder { private Path downloadPath; private SpeciesConfiguration speciesConfiguration; private boolean flexibleGTFParsing; private CellBaseConfiguration configuration; - private Map transcriptDict; - private Map exonDict; + private final Map transcriptDict; + private final Map exonDict; private Path gtfFile; private Path proteinFastaFile; @@ -77,8 +77,8 @@ public class EnsemblGeneBuilder extends CellBaseBuilder { private Path cancerGeneCensusFile; private Path cancerHostpotFile; private Path ensemblCanonicalFile; - private Path tso500File; - private Path eglhHaemOncFile; +// private Path tso500File; +// private 
Path eglhHaemOncFile; // source for genes is either ensembl or refseq private final String SOURCE = ParamConstants.QueryParams.ENSEMBL.key(); @@ -136,8 +136,8 @@ public void check() throws Exception { ensemblCanonicalFile = checkFile(props, ENSEMBL_CANONICAL_FILE_ID, downloadPath.getParent(), "Ensembl Canonical").toPath(); if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - tso500File = checkFile(props, ENSEMBL_TSO500_FILE_ID, downloadPath.getParent(), "Ensembl TSO 500").toPath(); - eglhHaemOncFile = checkFile(props, ENSEMBL_HAEM_ONC_TRANSCRIPTS_FILE_ID, downloadPath.getParent(), "EGLH Haem Onc").toPath(); +// tso500File = checkFile(props, ENSEMBL_TSO500_FILE_ID, downloadPath.getParent(), "Ensembl TSO 500").toPath(); +// eglhHaemOncFile = checkFile(props, ENSEMBL_HAEM_ONC_TRANSCRIPTS_FILE_ID, downloadPath.getParent(), "EGLH Haem Onc").toPath(); maneFile = checkFiles(MANE_SELECT_DATA, downloadPath.getParent(), 1).get(0).toPath(); lrgFile = checkFiles(LRG_DATA, downloadPath.getParent(), 1).get(0).toPath(); hgncFile = checkFiles(HGNC_DATA, downloadPath.getParent(), 1).get(0).toPath(); @@ -228,14 +228,12 @@ public void parse() throws Exception { int cds = 1; EnsemblGeneBuilderIndexer indexer = new EnsemblGeneBuilderIndexer(serializer.getOutdir()); - try { // process files and put values in rocksdb indexer.index(geneDescriptionFile, xrefsFile, hgncFile, maneFile, lrgFile, uniprotIdMappingFile, proteinFastaFile, cDnaFastaFile, speciesConfiguration.getScientificName(), geneExpressionFile, geneDrugFile, hpoFile, disgenetFile, gnomadFile, geneOntologyAnnotationFile, miRBaseFile, - miRTarBaseFile, cancerGeneCensusFile, cancerHostpotFile, ensemblCanonicalFile, - tso500File, eglhHaemOncFile); + miRTarBaseFile, cancerGeneCensusFile, cancerHostpotFile, ensemblCanonicalFile); TabixReader tabixReader = null; if (!Files.exists(tfbsFile) || !Files.exists(tabixFile)) { @@ -245,7 +243,6 @@ public void parse() throws Exception { } // Preparing the fasta file for 
fast accessing -// System.out.println("genomeSequenceFilePath.toString() = " + genomeSequenceFilePath.toString()); FastaIndex fastaIndex = new FastaIndex(genomeSequenceFilePath); // Empty transcript and exon dictionaries @@ -493,6 +490,7 @@ private Transcript getTranscript(Gene gene, EnsemblGeneBuilderIndexer indexer, T if (StringUtils.isNotEmpty(tags)) { transcript.getFlags().addAll(Arrays.asList(tags.split(","))); } + // 2. TSL String supportLevel = gtfAttributes.get("transcript_support_level"); if (StringUtils.isNotEmpty(supportLevel)) { @@ -500,11 +498,13 @@ private Transcript getTranscript(Gene gene, EnsemblGeneBuilderIndexer indexer, T String truncatedSupportLevel = supportLevel.split(" ")[0]; transcript.getFlags().add("TSL:" + truncatedSupportLevel); } + // 3. MANE Flag String maneFlag = indexer.getMane(transcriptIdWithVersion, "flag"); if (StringUtils.isNotEmpty(maneFlag)) { transcript.getFlags().add(maneFlag); } + // 4. LRG Flag String lrg = indexer.getLrg(transcriptIdWithVersion, "ensembl"); if (StringUtils.isNotEmpty(lrg)) { @@ -516,6 +516,7 @@ private Transcript getTranscript(Gene gene, EnsemblGeneBuilderIndexer indexer, T } } } + // 5. Ensembl Canonical String canonicalFlag = indexer.getCanonical(transcriptIdWithVersion); if (StringUtils.isNotEmpty(canonicalFlag)) { @@ -523,18 +524,18 @@ private Transcript getTranscript(Gene gene, EnsemblGeneBuilderIndexer indexer, T } // 6. 
TSO500 and EGLH HaemOnc - String maneRefSeq = indexer.getMane(transcriptIdWithVersion, "refseq"); - if (StringUtils.isNotEmpty(maneRefSeq)) { - String tso500Flag = indexer.getTSO500(maneRefSeq.split("\\.")[0]); - if (StringUtils.isNotEmpty(tso500Flag)) { - transcript.getFlags().add(tso500Flag); - } - - String eglhHaemOncFlag = indexer.getEGLHHaemOnc(maneRefSeq.split("\\.")[0]); - if (StringUtils.isNotEmpty(eglhHaemOncFlag)) { - transcript.getFlags().add(eglhHaemOncFlag); - } - } +// String maneRefSeq = indexer.getMane(transcriptIdWithVersion, "refseq"); +// if (StringUtils.isNotEmpty(maneRefSeq)) { +// String tso500Flag = indexer.getTSO500(maneRefSeq.split("\\.")[0]); +// if (StringUtils.isNotEmpty(tso500Flag)) { +// transcript.getFlags().add(tso500Flag); +// } +// +// String eglhHaemOncFlag = indexer.getEGLHHaemOnc(maneRefSeq.split("\\.")[0]); +// if (StringUtils.isNotEmpty(eglhHaemOncFlag)) { +// transcript.getFlags().add(eglhHaemOncFlag); +// } +// } gene.getTranscripts().add(transcript); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java index 0b102d015c..d46ebef225 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java @@ -50,8 +50,8 @@ import java.util.zip.GZIPInputStream; import static org.opencb.cellbase.lib.EtlCommons.ENSEMBL_DATA; -import static org.opencb.cellbase.lib.builders.CellBaseBuilder.PARSING_DONE_LOG_MESSAGE; -import static org.opencb.cellbase.lib.builders.CellBaseBuilder.PARSING_LOG_MESSAGE; +import static org.opencb.cellbase.lib.builders.AbstractBuilder.PARSING_DONE_LOG_MESSAGE; +import static org.opencb.cellbase.lib.builders.AbstractBuilder.PARSING_LOG_MESSAGE; public class EnsemblGeneBuilderIndexer extends GeneBuilderIndexer { @@ -72,7 +72,7 @@ public 
EnsemblGeneBuilderIndexer(Path geneDirectoryPath) { public void index(Path geneDescriptionFile, Path xrefsFile, Path hgncFile, Path maneFile, Path lrgFile, Path uniprotIdMappingFile, Path proteinFastaFile, Path cDnaFastaFile, String species, Path geneExpressionFile, Path geneDrugFile, Path hpoFile, Path disgenetFile, Path gnomadFile, Path geneOntologyAnnotationFile, Path miRBaseFile, Path miRTarBaseFile, - Path cancerGeneGensusFile, Path cancerHostpotFile, Path canonicalFile, Path tso500File, Path eglhHaemOncFile) + Path cancerGeneGensusFile, Path cancerHostpotFile, Path canonicalFile) throws IOException, RocksDBException, FileFormatException, CellBaseException { indexDescriptions(geneDescriptionFile); indexXrefs(xrefsFile, uniprotIdMappingFile); @@ -91,8 +91,8 @@ public void index(Path geneDescriptionFile, Path xrefsFile, Path hgncFile, Path indexCancerGeneCensus(cancerGeneGensusFile); indexCancerHotspot(cancerHostpotFile); indexCanonical(canonicalFile); - indexTSO500(tso500File); - indexEGLHHaemOnc(eglhHaemOncFile); +// indexTSO500(tso500File); +// indexEGLHHaemOnc(eglhHaemOncFile); } private void indexDescriptions(Path geneDescriptionFile) throws IOException, RocksDBException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java index 43a654f2da..a5dda27e34 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java @@ -25,14 +25,13 @@ import static org.opencb.cellbase.lib.EtlCommons.*; -public class GeneBuilder extends CellBaseBuilder { +public class GeneBuilder extends AbstractBuilder { private EnsemblGeneBuilder ensemblGeneBuilder; private RefSeqGeneBuilder refSeqGeneBuilder; public GeneBuilder(Path downloadPath, Path buildPath, SpeciesConfiguration speciesConfiguration, boolean flexibleGTFParsing, - CellBaseConfiguration 
configuration) - throws CellBaseException { + CellBaseConfiguration configuration) throws CellBaseException { super(null); // Create Ensembl gene builder diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java index b8941cc448..c220d271d9 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java @@ -44,8 +44,8 @@ import java.util.stream.Collectors; import static org.opencb.cellbase.lib.EtlCommons.*; -import static org.opencb.cellbase.lib.builders.CellBaseBuilder.PARSING_DONE_LOG_MESSAGE; -import static org.opencb.cellbase.lib.builders.CellBaseBuilder.PARSING_LOG_MESSAGE; +import static org.opencb.cellbase.lib.builders.AbstractBuilder.PARSING_DONE_LOG_MESSAGE; +import static org.opencb.cellbase.lib.builders.AbstractBuilder.PARSING_LOG_MESSAGE; public class GeneBuilderIndexer { @@ -67,8 +67,8 @@ public class GeneBuilderIndexer { protected final String DRUGS_SUFFIX = "_drug"; protected final String DISEASE_SUFFIX = "_disease"; protected final String MIRTARBASE_SUFFIX = "_mirtarbase"; - protected final String TSO500_SUFFIX = "_tso500"; - protected final String EGLH_HAEMONC_SUFFIX = "_eglh_haemonc"; +// protected final String TSO500_SUFFIX = "_tso500"; +// protected final String EGLH_HAEMONC_SUFFIX = "_eglh_haemonc"; public GeneBuilderIndexer(Path genePath) { this.init(genePath); @@ -414,65 +414,65 @@ public List getCancerHotspot(String geneName) throws RocksDBExcep return rocksDbManager.getCancerHotspot(rocksdb, key); } - protected void indexTSO500(Path tso500Path) throws IOException, RocksDBException { - logger.info(PARSING_LOG_MESSAGE, tso500Path); - - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(tso500Path)) { - String line = bufferedReader.readLine(); - // Gene Ref Seq - // FAS NM_000043 - // 
AR NM_000044 - while (StringUtils.isNotEmpty(line)) { - if (!line.startsWith("#")) { - String[] fields = line.split("\t", -1); - if (fields.length == 2) { - rocksDbManager.update(rocksdb, fields[1] + TSO500_SUFFIX, "TSO500"); - } - } - line = bufferedReader.readLine(); - } - } - logger.info(PARSING_DONE_LOG_MESSAGE, tso500Path); - } - - public String getTSO500(String transcriptId) throws RocksDBException { - String key = transcriptId + TSO500_SUFFIX; - byte[] bytes = rocksdb.get(key.getBytes()); - if (bytes == null) { - return null; - } - return new String(bytes); - } - - protected void indexEGLHHaemOnc(Path eglhHaemOncPath) throws IOException, RocksDBException { - logger.info(PARSING_LOG_MESSAGE, eglhHaemOncPath); - - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(eglhHaemOncPath)) { - String line = bufferedReader.readLine(); - // Gene Ref Seq - // GNB1 NM_002074.4 - // CSF3R NM_000760.3 - while (StringUtils.isNotEmpty(line)) { - if (!line.startsWith("#")) { - String[] fields = line.split("\t", -1); - if (fields.length == 2) { - rocksDbManager.update(rocksdb, fields[1].split("\\.")[0] + EGLH_HAEMONC_SUFFIX, "EGLH_HaemOnc"); - } - } - line = bufferedReader.readLine(); - } - } - logger.info(PARSING_DONE_LOG_MESSAGE, eglhHaemOncPath); - } - - public String getEGLHHaemOnc(String transcriptId) throws RocksDBException { - String key = transcriptId + EGLH_HAEMONC_SUFFIX; - byte[] bytes = rocksdb.get(key.getBytes()); - if (bytes == null) { - return null; - } - return new String(bytes); - } +// protected void indexTSO500(Path tso500Path) throws IOException, RocksDBException { +// logger.info(PARSING_LOG_MESSAGE, tso500Path); +// +// try (BufferedReader bufferedReader = FileUtils.newBufferedReader(tso500Path)) { +// String line = bufferedReader.readLine(); +// // Gene Ref Seq +// // FAS NM_000043 +// // AR NM_000044 +// while (StringUtils.isNotEmpty(line)) { +// if (!line.startsWith("#")) { +// String[] fields = line.split("\t", -1); +// if (fields.length 
== 2) { +// rocksDbManager.update(rocksdb, fields[1] + TSO500_SUFFIX, "TSO500"); +// } +// } +// line = bufferedReader.readLine(); +// } +// } +// logger.info(PARSING_DONE_LOG_MESSAGE, tso500Path); +// } +// +// public String getTSO500(String transcriptId) throws RocksDBException { +// String key = transcriptId + TSO500_SUFFIX; +// byte[] bytes = rocksdb.get(key.getBytes()); +// if (bytes == null) { +// return null; +// } +// return new String(bytes); +// } + +// protected void indexEGLHHaemOnc(Path eglhHaemOncPath) throws IOException, RocksDBException { +// logger.info(PARSING_LOG_MESSAGE, eglhHaemOncPath); +// +// try (BufferedReader bufferedReader = FileUtils.newBufferedReader(eglhHaemOncPath)) { +// String line = bufferedReader.readLine(); +// // Gene Ref Seq +// // GNB1 NM_002074.4 +// // CSF3R NM_000760.3 +// while (StringUtils.isNotEmpty(line)) { +// if (!line.startsWith("#")) { +// String[] fields = line.split("\t", -1); +// if (fields.length == 2) { +// rocksDbManager.update(rocksdb, fields[1].split("\\.")[0] + EGLH_HAEMONC_SUFFIX, "EGLH_HaemOnc"); +// } +// } +// line = bufferedReader.readLine(); +// } +// } +// logger.info(PARSING_DONE_LOG_MESSAGE, eglhHaemOncPath); +// } +// +// public String getEGLHHaemOnc(String transcriptId) throws RocksDBException { +// String key = transcriptId + EGLH_HAEMONC_SUFFIX; +// byte[] bytes = rocksdb.get(key.getBytes()); +// if (bytes == null) { +// return null; +// } +// return new String(bytes); +// } private String getIndexEntry(String id, String suffix) throws RocksDBException { return getIndexEntry(id, suffix, ""); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderUtils.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderUtils.java deleted file mode 100644 index 16dbbc9a3c..0000000000 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderUtils.java +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Copyright 2015-2020 OpenCB - * - * Licensed 
under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.opencb.cellbase.lib.builders; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Created by imedina on 12/11/15. - */ -@Deprecated -public class GeneBuilderUtils { - - private static Logger logger = LoggerFactory.getLogger(GeneBuilderUtils.class); - -// @Deprecated -// public static Map> getTfbsMap(Path tfbsFile) throws IOException, NoSuchMethodException, FileFormatException { -// Map> tfbsMap = new HashMap<>(); -// if (tfbsFile != null && Files.exists(tfbsFile) && !Files.isDirectory(tfbsFile) && Files.size(tfbsFile) > 0) { -// Gff2Reader motifsFeatureReader = new Gff2Reader(tfbsFile); -// Gff2 tfbsMotifFeature; -// while ((tfbsMotifFeature = motifsFeatureReader.read()) != null) { -// // we only want high quality data. 
See issue 466 -// if (!tfbsMotifFeature.getAttribute().contains("experimental_evidence")) { -// continue; -// } -// String chromosome = tfbsMotifFeature.getSequenceName().replaceFirst("chr", ""); -// SortedSet chromosomeTfbsSet = tfbsMap.get(chromosome); -// if (chromosomeTfbsSet == null) { -// chromosomeTfbsSet = new TreeSet<>((Comparator) (feature1, feature2) -> { -// // TODO: maybe this should be in TranscriptTfbs class, and equals method should be overriden too -// if (feature1.getStart() != feature2.getStart()) { -// return feature1.getStart() - feature2.getStart(); -// } else { -// return feature1.getAttribute().compareTo(feature2.getAttribute()); -// } -// }); -// tfbsMap.put(chromosome, chromosomeTfbsSet); -// } -// chromosomeTfbsSet.add(tfbsMotifFeature); -// } -// motifsFeatureReader.close(); -// } -// return tfbsMap; -// } - -// public static Map> getXrefMap(Path xrefsFile, Path uniprotIdMappingFile) throws IOException { -// Map> xrefMap = new HashMap<>(); -// logger.info("Loading xref data..."); -// String[] fields; -// if (xrefsFile != null && Files.exists(xrefsFile) && Files.size(xrefsFile) > 0) { -// List lines = Files.readAllLines(xrefsFile, Charset.forName("ISO-8859-1")); -// for (String line : lines) { -// fields = line.split("\t", -1); -// if (fields.length >= 4) { -// if (!xrefMap.containsKey(fields[0])) { -// xrefMap.put(fields[0], new ArrayList<>()); -// } -// xrefMap.get(fields[0]).add(new Xref(fields[1], fields[2], fields[3])); -// } -// } -// } else { -// logger.warn("Xrefs file " + xrefsFile + " not found"); -// logger.warn("Xref data not loaded"); -// } -// -// logger.info("Loading protein mapping into xref data..."); -// if (uniprotIdMappingFile != null && Files.exists(uniprotIdMappingFile) && Files.size(uniprotIdMappingFile) > 0) { -// BufferedReader br = FileUtils.newBufferedReader(uniprotIdMappingFile); -// String line; -// while ((line = br.readLine()) != null) { -// fields = line.split("\t", -1); -// if (fields.length >= 19 && 
fields[19].startsWith("ENST")) { -// String[] transcripts = fields[19].split("; "); -// for (String transcript : transcripts) { -// if (!xrefMap.containsKey(transcript)) { -// xrefMap.put(transcript, new ArrayList()); -// } -// xrefMap.get(transcript).add(new Xref(fields[0], "uniprotkb_acc", "UniProtKB ACC")); -// xrefMap.get(transcript).add(new Xref(fields[1], "uniprotkb_id", "UniProtKB ID")); -// } -// } -// } -// br.close(); -// } else { -// logger.warn("Uniprot if mapping file " + uniprotIdMappingFile + " not found"); -// logger.warn("Protein mapping into xref data not loaded"); -// } -// -// return xrefMap; -// } - -// public static Map> getGeneDrugMap(Path geneDrugFile) throws IOException { -// Map> geneDrugMap = new HashMap<>(); -// if (geneDrugFile != null && Files.exists(geneDrugFile) && Files.size(geneDrugFile) > 0) { -// logger.info("Loading gene-drug interaction data from '{}'", geneDrugFile); -// BufferedReader br = FileUtils.newBufferedReader(geneDrugFile); -// -// // Skip header -// br.readLine(); -// -// int lineCounter = 1; -// String line; -// while ((line = br.readLine()) != null) { -// String[] parts = line.split("\t"); -// String geneName = parts[0]; -// -// String source = null; -// if (parts.length >= 4) { -// source = parts[3]; -// } -// -// String interactionType = null; -// if (parts.length >= 5) { -// interactionType = parts[4]; -// } -// -// String drugName = null; -// if (parts.length >= 8) { -// // if drug name column is empty, use drug claim name instead -// drugName = StringUtils.isEmpty(parts[7]) ? 
parts[6] : parts[7]; -// } -// if (StringUtils.isEmpty(drugName)) { -// // no drug name -// continue; -// } -// -// String chemblId = null; -// if (parts.length >= 9) { -// chemblId = parts[8]; -// } -// -// List publications = new ArrayList<>(); -// if (parts.length >= 10 && parts[9] != null) { -// publications = Arrays.asList(parts[9].split(",")); -// } -// -// //addValueToMapElement(geneDrugMap, geneName, new GeneDrugInteraction(geneName, drugName, source, null, interactionType)); -// // TODO update model to add new attributes -// addValueToMapElement(geneDrugMap, geneName, new GeneDrugInteraction(geneName, drugName, source, null, null, -// interactionType, chemblId, publications)); -// lineCounter++; -// } -// -// br.close(); -// } else { -// logger.warn("Gene drug file " + geneDrugFile + " not found"); -// logger.warn("Ignoring " + geneDrugFile); -// } -// -// return geneDrugMap; -// } - - -// -// public static Map> getGeneDiseaseAssociationMap(Path hpoFilePath, Path disgenetFilePath) -// throws IOException { -// Map> geneDiseaseAssociationMap = new HashMap<>(50000); -// -// String line; -// if (hpoFilePath != null && hpoFilePath.toFile().exists() && Files.size(hpoFilePath) > 0) { -// BufferedReader bufferedReader = FileUtils.newBufferedReader(hpoFilePath); -// // skip first header line -// bufferedReader.readLine(); -// while ((line = bufferedReader.readLine()) != null) { -// String[] fields = line.split("\t"); -// String omimId = fields[6]; -// String geneSymbol = fields[3]; -// String hpoId = fields[0]; -// String diseaseName = fields[1]; -// GeneTraitAssociation disease = -// new GeneTraitAssociation(omimId, diseaseName, hpoId, 0f, 0, new ArrayList<>(), new ArrayList<>(), "hpo"); -// addValueToMapElement(geneDiseaseAssociationMap, geneSymbol, disease); -// } -// bufferedReader.close(); -// } -// -// if (disgenetFilePath != null && disgenetFilePath.toFile().exists() && Files.size(disgenetFilePath) > 0) { -// BufferedReader bufferedReader = 
FileUtils.newBufferedReader(disgenetFilePath); -// // skip first header line -// bufferedReader.readLine(); -// while ((line = bufferedReader.readLine()) != null) { -// String[] fields = line.split("\t"); -// String diseaseId = fields[4]; -// String diseaseName = fields[5]; -// String score = fields[9]; -// String numberOfPubmeds = fields[13].trim(); -// String numberOfSNPs = fields[14]; -// String source = fields[15]; -// GeneTraitAssociation disease = new GeneTraitAssociation(diseaseId, diseaseName, "", Float.parseFloat(score), -// Integer.parseInt(numberOfPubmeds), Arrays.asList(numberOfSNPs), Arrays.asList(source), "disgenet"); -// addValueToMapElement(geneDiseaseAssociationMap, fields[1], disease); -// } -// bufferedReader.close(); -// } -// -// return geneDiseaseAssociationMap; -// } -// -// /** -// * For a gnomad file, parse and return a map of transcript to constraints. -// * -// * @param gnomadFile gene annotation file path -// * @return map of transcript to constraints -// * @throws IOException if goa file can't be read -// */ -// public static Map> getConstraints(Path gnomadFile) throws IOException { -// Map> transcriptConstraints = new HashMap<>(); -// -// if (gnomadFile != null && Files.exists(gnomadFile) && Files.size(gnomadFile) > 0) { -// logger.info("Loading OE scores from '{}'", gnomadFile); -//// BufferedReader br = FileUtils.newBufferedReader(gnomadFile); -// InputStream inputStream = Files.newInputStream(gnomadFile); -// BufferedReader br = new BufferedReader(new InputStreamReader(new GZIPInputStream(inputStream))); -// // Skip header. 
-// br.readLine(); -// String line; -// while ((line = br.readLine()) != null) { -// String[] parts = line.split("\t"); -// String transcriptIdentifier = parts[1]; -// String canonical = parts[2]; -// String oeMis = parts[5]; -// String oeSyn = parts[14]; -// String oeLof = parts[24]; -// String exacPLI = parts[70]; -// String exacLof = parts[73]; -// String geneIdentifier = parts[64]; -// -// List constraints = new ArrayList<>(); -// addConstraint(constraints, "oe_mis", oeMis); -// addConstraint(constraints, "oe_syn", oeSyn); -// addConstraint(constraints, "oe_lof", oeLof); -// addConstraint(constraints, "exac_pLI", exacPLI); -// addConstraint(constraints, "exac_oe_lof", exacLof); -// transcriptConstraints.put(transcriptIdentifier, constraints); -// -// if ("TRUE".equalsIgnoreCase(canonical)) { -// transcriptConstraints.put(geneIdentifier, constraints); -// } -// } -// br.close(); -// } -// return transcriptConstraints; -// } -// -// private static void addConstraint(List constraints, String name, String value) { -// Constraint constraint = new Constraint(); -// constraint.setMethod("pLoF"); -// constraint.setSource("gnomAD"); -// constraint.setName(name); -// try { -// constraint.setValue(Double.parseDouble(value)); -// } catch (NumberFormatException e) { -// // invalid number (e.g. NA), discard. -// return; -// } -// constraints.add(constraint); -// } -// -// /** -// * For a gene annotation file, parse and return a map of proteins to ontology annotation objects. -// * -// * @param goaFile gene annotation file path -// * @return map of proteins to ontology annotation objects. 
-// * @throws IOException if goa file can't be read -// */ -// public static Map> getOntologyAnnotations(Path goaFile) throws IOException { -// Map> annotations = new HashMap<>(); -// if (goaFile != null && Files.exists(goaFile) && Files.size(goaFile) > 0) { -// logger.info("Loading GO annotation from '{}'", goaFile); -// BufferedReader br = FileUtils.newBufferedReader(goaFile); -// GafParser parser = new GafParser(); -// annotations = parser.parseGaf(br); -// } -// return annotations; -// } -} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneExpressionAtlasBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneExpressionAtlasBuilder.java index 7428cd5fbf..e5f6449051 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneExpressionAtlasBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneExpressionAtlasBuilder.java @@ -31,7 +31,7 @@ /** * Created by antonior on 10/16/14. */ -public class GeneExpressionAtlasBuilder extends CellBaseBuilder { +public class GeneExpressionAtlasBuilder extends AbstractBuilder { private Path geneAtlasDirectoryPath; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GenomeSequenceFastaBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GenomeSequenceFastaBuilder.java index 5bb232f5d2..17f3472b20 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GenomeSequenceFastaBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GenomeSequenceFastaBuilder.java @@ -24,7 +24,7 @@ import java.io.IOException; import java.nio.file.Path; -public class GenomeSequenceFastaBuilder extends CellBaseBuilder { +public class GenomeSequenceFastaBuilder extends AbstractBuilder { private Path genomeReferenceFastaFile; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/InteractionBuilder.java 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/InteractionBuilder.java index 5fcc68c206..5bc18dba17 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/InteractionBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/InteractionBuilder.java @@ -33,7 +33,7 @@ * Time: 4:43 PM * To change this template use File | Settings | File Templates. */ -public class InteractionBuilder extends CellBaseBuilder { +public class InteractionBuilder extends AbstractBuilder { private final String species; private final Path psimiTabFile; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java index b14d20b54c..34710bfe3a 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java @@ -31,7 +31,7 @@ import static org.opencb.cellbase.lib.EtlCommons.*; -public class OntologyBuilder extends CellBaseBuilder { +public class OntologyBuilder extends AbstractBuilder { private Path oboDownloadPath; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java index 1a0ba2e7d3..dc5bb32ee2 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java @@ -39,7 +39,7 @@ import static org.opencb.cellbase.lib.EtlCommons.*; -public class PharmGKBBuilder extends CellBaseBuilder { +public class PharmGKBBuilder extends AbstractBuilder { private final Path pharmGkbDownloadPath; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java index d8246241e4..4beef32a99 
100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java @@ -44,7 +44,7 @@ import static org.opencb.cellbase.lib.EtlCommons.*; -public class ProteinBuilder extends CellBaseBuilder { +public class ProteinBuilder extends AbstractBuilder { private Path proteinPath; private String species; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PubMedBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PubMedBuilder.java index 348d22a07d..5443b9aea9 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PubMedBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PubMedBuilder.java @@ -32,7 +32,7 @@ import static org.opencb.cellbase.lib.EtlCommons.PUBMED_DATA; import static org.opencb.cellbase.lib.EtlCommons.getDataName; -public class PubMedBuilder extends CellBaseBuilder { +public class PubMedBuilder extends AbstractBuilder { private Path pubMedDownloadPath; private CellBaseConfiguration configuration; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java index 7b3b9f345b..b6a4545215 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java @@ -40,7 +40,7 @@ import static org.opencb.cellbase.lib.EtlCommons.*; -public class RefSeqGeneBuilder extends CellBaseBuilder { +public class RefSeqGeneBuilder extends AbstractBuilder { private Path downloadPath; private CellBaseConfiguration configuration; @@ -59,8 +59,8 @@ public class RefSeqGeneBuilder extends CellBaseBuilder { private Path miRTarBaseFile; private Path cancerGeneCensusFile; private Path cancerHotspot; - private Path tso500File; - private Path eglhHaemOncFile; +// 
private Path tso500File; +// private Path eglhHaemOncFile; private SpeciesConfiguration speciesConfiguration; private static final Map REFSEQ_CHROMOSOMES = new HashMap<>(); private static final String KNOWN_STATUS = "KNOWN"; @@ -134,8 +134,8 @@ public void check() throws Exception { // Check common files props = configuration.getDownload().getEnsembl().getUrl(); if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - tso500File = checkFile(props, ENSEMBL_TSO500_FILE_ID, downloadPath.getParent(), "Ensembl TSO 500").toPath(); - eglhHaemOncFile = checkFile(props, ENSEMBL_HAEM_ONC_TRANSCRIPTS_FILE_ID, downloadPath.getParent(), "EGLH Haem Onc").toPath(); +// tso500File = checkFile(props, ENSEMBL_TSO500_FILE_ID, downloadPath.getParent(), "Ensembl TSO 500").toPath(); +// eglhHaemOncFile = checkFile(props, ENSEMBL_HAEM_ONC_TRANSCRIPTS_FILE_ID, downloadPath.getParent(), "EGLH Haem Onc").toPath(); maneFile = checkFiles(MANE_SELECT_DATA, downloadPath.getParent(), 1).get(0).toPath(); lrgFile = checkFiles(LRG_DATA, downloadPath.getParent(), 1).get(0).toPath(); cancerHotspot = checkFiles(CANCER_HOTSPOT_DATA, downloadPath.getParent(), 1).get(0).toPath(); @@ -186,7 +186,7 @@ public void parse() throws Exception { logger.info("Indexing gene annotation for {} ...", getDataName(REFSEQ_DATA)); RefSeqGeneBuilderIndexer indexer = new RefSeqGeneBuilderIndexer(gtfFile.getParent()); indexer.index(maneFile, lrgFile, proteinFastaFile, cdnaFastaFile, geneDrugFile, hpoFile, disgenetFile, miRTarBaseFile, - cancerGeneCensusFile, cancerHotspot, tso500File, eglhHaemOncFile); + cancerGeneCensusFile, cancerHotspot); logger.info("Indexing done for {}", getDataName(REFSEQ_DATA)); logger.info(PARSING_LOG_MESSAGE, gtfFile); @@ -660,14 +660,14 @@ private Transcript getTranscript(Gtf gtf, String chromosome, String transcriptId transcript.getFlags().add("LRG"); } // 3. 
TSO500 and EGLH HaemOnc - String tso500Flag = indexer.getTSO500(transcriptId.split("\\.")[0]); - if (StringUtils.isNotEmpty(tso500Flag)) { - transcript.getFlags().add(tso500Flag); - } - String eglhHaemOncFlag = indexer.getEGLHHaemOnc(transcriptId.split("\\.")[0]); - if (StringUtils.isNotEmpty(eglhHaemOncFlag)) { - transcript.getFlags().add(eglhHaemOncFlag); - } +// String tso500Flag = indexer.getTSO500(transcriptId.split("\\.")[0]); +// if (StringUtils.isNotEmpty(tso500Flag)) { +// transcript.getFlags().add(tso500Flag); +// } +// String eglhHaemOncFlag = indexer.getEGLHHaemOnc(transcriptId.split("\\.")[0]); +// if (StringUtils.isNotEmpty(eglhHaemOncFlag)) { +// transcript.getFlags().add(eglhHaemOncFlag); +// } gene.getTranscripts().add(transcript); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java index 8542e76b1c..596c8b61c9 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java @@ -32,8 +32,8 @@ public RefSeqGeneBuilderIndexer(Path refSeqDirectoryPath) { } public void index(Path maneFile, Path lrgFile, Path proteinFastaFile, Path cDnaFastaFile, Path geneDrugFile, Path hpoFilePath, - Path disgenetFile, Path miRTarBaseFile, Path cancerGeneGensus, Path cancerHotspot, Path tso500File, - Path eglhHaemOncFile) throws IOException, RocksDBException, FileFormatException, CellBaseException { + Path disgenetFile, Path miRTarBaseFile, Path cancerGeneGensus, Path cancerHotspot) + throws IOException, RocksDBException, FileFormatException, CellBaseException { indexManeMapping(maneFile, REFSEQ_DATA); indexLrgMapping(lrgFile, REFSEQ_DATA); indexProteinSequences(proteinFastaFile); @@ -43,7 +43,7 @@ public void index(Path maneFile, Path lrgFile, Path proteinFastaFile, Path cDnaF 
indexMiRTarBase(miRTarBaseFile); indexCancerGeneCensus(cancerGeneGensus); indexCancerHotspot(cancerHotspot); - indexTSO500(tso500File); - indexEGLHHaemOnc(eglhHaemOncFile); +// indexTSO500(tso500File); +// indexEGLHHaemOnc(eglhHaemOncFile); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java index 83eccb9885..752290e147 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java @@ -42,7 +42,7 @@ import static org.opencb.cellbase.lib.EtlCommons.*; -public class RegulatoryFeatureBuilder extends CellBaseBuilder { +public class RegulatoryFeatureBuilder extends AbstractBuilder { private Path regulationPath; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java index 5ffabf747b..041c52f522 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java @@ -36,7 +36,7 @@ /** * Created by fjlopez on 05/05/17. 
*/ -public class RepeatsBuilder extends CellBaseBuilder { +public class RepeatsBuilder extends AbstractBuilder { private CellBaseConfiguration configuration; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java index 06f38f28f0..4f0dac0a81 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java @@ -32,7 +32,7 @@ import static org.opencb.cellbase.lib.EtlCommons.*; -public class RevelScoreBuilder extends CellBaseBuilder { +public class RevelScoreBuilder extends AbstractBuilder { private Path revelDownloadPath = null; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/SpliceBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/SpliceBuilder.java index f0f08c65d3..bbd82344e7 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/SpliceBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/SpliceBuilder.java @@ -40,7 +40,7 @@ import static org.opencb.cellbase.lib.EtlCommons.MMSPLICE_DATA; import static org.opencb.cellbase.lib.EtlCommons.SPLICEAI_DATA; -public class SpliceBuilder extends CellBaseBuilder { +public class SpliceBuilder extends AbstractBuilder { private Path spliceDir; private CellBaseFileSerializer fileSerializer; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarParser.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarParser.java index e44ce53e90..4a95b65757 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarParser.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarParser.java @@ -20,7 +20,7 @@ import 
org.opencb.biodata.formats.variant.clinvar.rcv.v64jaxb.*; import org.opencb.cellbase.core.common.clinical.ClinvarPublicSet; import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.opencb.cellbase.lib.builders.CellBaseBuilder; +import org.opencb.cellbase.lib.builders.AbstractBuilder; import javax.xml.bind.JAXBElement; import javax.xml.bind.JAXBException; @@ -45,7 +45,7 @@ * Created by imedina on 26/09/14. */ @Deprecated -public class ClinVarParser extends CellBaseBuilder { +public class ClinVarParser extends AbstractBuilder { private static final String ASSEMBLY_PREFIX = "GRCh"; public static final String GRCH37_ASSEMBLY = "37"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java index e3c7ab3ff8..e3b18cd147 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java @@ -22,7 +22,7 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.opencb.cellbase.lib.builders.CellBaseBuilder; +import org.opencb.cellbase.lib.builders.AbstractBuilder; import org.opencb.commons.utils.FileUtils; import org.rocksdb.Options; import org.rocksdb.RocksDB; @@ -39,7 +39,7 @@ /** * Created by fjlopez on 26/09/16. 
*/ -public class ClinicalVariantBuilder extends CellBaseBuilder { +public class ClinicalVariantBuilder extends AbstractBuilder { private final Path clinicalVariantPath; private final String assembly; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicBuilder.java index 0a8931b536..e103385556 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicBuilder.java @@ -16,7 +16,7 @@ package org.opencb.cellbase.lib.builders.clinical.variant; -import org.opencb.cellbase.lib.builders.CellBaseBuilder; +import org.opencb.cellbase.lib.builders.AbstractBuilder; import org.opencb.cellbase.core.common.clinical.Cosmic; import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.opencb.cellbase.lib.variant.VariantAnnotationUtils; @@ -37,7 +37,7 @@ * @since October 08, 2014 */ @Deprecated -public class CosmicBuilder extends CellBaseBuilder { +public class CosmicBuilder extends AbstractBuilder { private final Path cosmicFilePath; private static final String CHROMOSOME = "CHR"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java deleted file mode 100644 index 0d3203e7e2..0000000000 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright 2015-2020 OpenCB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.opencb.cellbase.lib.download; - -import org.opencb.cellbase.core.config.CellBaseConfiguration; -import org.opencb.cellbase.core.exception.CellBaseException; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.List; - -public class Downloader { - - private final String species; - private final String assembly; - private final Path outputDirectory; - private final CellBaseConfiguration configuration; - - public Downloader(String species, String assembly, Path outputDirectory, CellBaseConfiguration configuration) { - this.species = species; - this.assembly = assembly; - this.outputDirectory = outputDirectory; - this.configuration = configuration; - } - - public List downloadGenome() throws IOException, CellBaseException, InterruptedException { - GenomeDownloadManager manager = new GenomeDownloadManager(species, assembly, outputDirectory, configuration); - return manager.download(); - } - - public List downloadRepeats() throws IOException, CellBaseException, InterruptedException { - RepeatsDownloadManager manager = new RepeatsDownloadManager(species, assembly, outputDirectory, configuration); - return manager.downloadRepeats(); - } - - public List downloadGene() throws IOException, CellBaseException, InterruptedException { - GeneDownloadManager manager = new GeneDownloadManager(species, assembly, outputDirectory, configuration); - return manager.download(); - } - - public List downloadRegulation() throws IOException, CellBaseException, InterruptedException { - RegulationDownloadManager manager = new 
RegulationDownloadManager(species, assembly, outputDirectory, configuration); - return manager.download(); - } - - public List downloadProtein() throws IOException, CellBaseException, InterruptedException { - ProteinDownloadManager manager = new ProteinDownloadManager(species, assembly, outputDirectory, configuration); - return manager.download(); - } - - public List downloadConservation() throws IOException, CellBaseException, InterruptedException { - ConservationDownloadManager manager = new ConservationDownloadManager(species, assembly, outputDirectory, configuration); - return manager.downloadConservation(); - } - - public List downloadClinicalVariants() throws IOException, CellBaseException, InterruptedException { - ClinicalDownloadManager manager = new ClinicalDownloadManager(species, assembly, outputDirectory, configuration); - return manager.download(); - } - - public List downloadOntologies() throws IOException, CellBaseException, InterruptedException { - OntologyDownloadManager manager = new OntologyDownloadManager(species, assembly, outputDirectory, configuration); - return manager.download(); - } - - public List downloadCaddScores() throws IOException, CellBaseException, InterruptedException { - CaddDownloadManager manager = new CaddDownloadManager(species, assembly, outputDirectory, configuration); - return manager.download(); - } - - public List downloadPredictionScores() throws IOException, CellBaseException, InterruptedException { - MissenseScoresDownloadManager manager = new MissenseScoresDownloadManager(species, assembly, outputDirectory, configuration); - return manager.download(); - } - - public List downloadSpliceScores() throws IOException, CellBaseException, InterruptedException { - SpliceScoreDownloadManager manager = new SpliceScoreDownloadManager(species, assembly, outputDirectory, configuration); - return manager.download(); - } - - public List downloadPubMed() throws IOException, CellBaseException, InterruptedException { - 
PubMedDownloadManager manager = new PubMedDownloadManager(species, assembly, outputDirectory, configuration); - return manager.download(); - } - - public List downloadPharmKGB() throws IOException, CellBaseException, InterruptedException { - PharmGKBDownloadManager manager = new PharmGKBDownloadManager(species, assembly, outputDirectory, configuration); - return manager.download(); - } -} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index 57eff8d865..7c0b41204b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -16,10 +16,12 @@ package org.opencb.cellbase.lib.download; +import org.opencb.cellbase.core.common.GitRepositoryState; import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.utils.SpeciesUtils; +import org.opencb.commons.utils.DockerUtils; import java.io.IOException; import java.nio.file.Files; @@ -106,6 +108,8 @@ public List download() throws IOException, InterruptedException, C } private List downloadEnsemblData(Path ensemblDownloadPath) throws IOException, InterruptedException, CellBaseException { + downloadEnsemblCanonical(); + List downloadFiles = new ArrayList<>(); // Check if the species is supported @@ -133,6 +137,32 @@ private List downloadEnsemblData(Path ensemblDownloadPath) throws return downloadFiles; } + public void downloadEnsemblCanonical() throws IOException, CellBaseException { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENOME_INFO_DATA)); + Path sequenceFolder = downloadFolder.resolve(GENOME_DATA); + Files.createDirectories(sequenceFolder); + + String dockerImage = "opencb/cellbase-builder:" + 
GitRepositoryState.get().getBuildVersion(); + try { + // Build command line to run Perl script via docker image + // Output binding + AbstractMap.SimpleEntry outputBinding = new AbstractMap.SimpleEntry<>( + sequenceFolder.toAbsolutePath().toString(), "/tmp"); + + // Params + String params = "/opt/cellbase/scripts/ensembl-scripts/ensembl_canonical.pl" + + " --species \"" + speciesConfiguration.getId() + "\"" + + " --outdir \"" + outputBinding.getValue() + "\""; + + // Execute perl script in docker + DockerUtils.run(dockerImage, null, outputBinding, params, null); + } catch (Exception e) { + throw new CellBaseException("Error executing Perl script from Docker " + dockerImage, e); + } + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENOME_INFO_DATA)); + } + private List downloadRefSeq(Path refSeqDownloadPath) throws IOException, InterruptedException, CellBaseException { List downloadFiles = new ArrayList<>(); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index ee67078f84..7fdec60e8f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -76,7 +76,7 @@ public void downloadGenomeInfo() throws IOException, CellBaseException { // Params String params = "/opt/cellbase/scripts/ensembl-scripts/genome_info.pl" - + " --species \"Homo sapiens\"" + + " --species \"" + speciesConfiguration.getScientificName() + "\"" + " --outfile \"" + outputBinding.getValue() + "/genome_info.json\""; // Execute perl script in docker From c6bcbdd9e05a6fa3ac890f43f054f076374bd66d Mon Sep 17 00:00:00 2001 From: imedina Date: Thu, 4 Jul 2024 16:47:39 +0100 Subject: [PATCH 085/148] Gene downloader fixes --- .../lib/download/GeneDownloadManager.java | 55 ++++++++++--------- 1 file changed, 28 insertions(+), 27 
deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index 7c0b41204b..9f1132437d 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -87,6 +87,8 @@ public List download() throws IOException, InterruptedException, C downloadFiles.add(downloadGO(geneDownloadPath)); logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_ANNOTATION_DATA)); + downloadEnsemblCanonical(); + // Save data sources manually downloaded // HPO saveDataSource(HPO_DISEASE_DATA, configuration.getDownload().getHpo().getVersion(), getTimeStamp(), @@ -108,8 +110,6 @@ public List download() throws IOException, InterruptedException, C } private List downloadEnsemblData(Path ensemblDownloadPath) throws IOException, InterruptedException, CellBaseException { - downloadEnsemblCanonical(); - List downloadFiles = new ArrayList<>(); // Check if the species is supported @@ -133,7 +133,32 @@ private List downloadEnsemblData(Path ensemblDownloadPath) throws logger.info(CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE, getDataName(ENSEMBL_DATA), getDataCategory(ENSEMBL_DATA)); } + return downloadFiles; + } + + private List downloadRefSeq(Path refSeqDownloadPath) throws IOException, InterruptedException, CellBaseException { + List downloadFiles = new ArrayList<>(); + + // Check if the species is supported + if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), GENE_DATA)) { + // GTF, DNA, RNA, Protein + String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); + if (configuration.getDownload().getGenomicSuperDups().getFiles().containsKey(prefixId + REFSEQ_GENOMIC_GTF_FILE_ID)) { + logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataName(REFSEQ_DATA), getDataCategory(REFSEQ_DATA)); 
+ + DownloadProperties.URLProperties refSeqConfig = configuration.getDownload().getRefSeq(); + downloadFiles.add(downloadDataSource(refSeqConfig, prefixId + REFSEQ_GENOMIC_GTF_FILE_ID, refSeqDownloadPath)); + downloadFiles.add(downloadDataSource(refSeqConfig, prefixId + REFSEQ_GENOMIC_FNA_FILE_ID, refSeqDownloadPath)); + downloadFiles.add(downloadDataSource(refSeqConfig, prefixId + REFSEQ_RNA_FNA_FILE_ID, refSeqDownloadPath)); + downloadFiles.add(downloadDataSource(refSeqConfig, prefixId + REFSEQ_PROTEIN_FAA_FILE_ID, refSeqDownloadPath)); + + // Save data source (i.e., metadata) + saveDataSource(REFSEQ_DATA, refSeqConfig.getVersion(), getTimeStamp(), getUrls(downloadFiles), + refSeqDownloadPath.resolve(getDataVersionFilename(REFSEQ_DATA))); + logger.info(CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE, getDataName(REFSEQ_DATA), getDataCategory(REFSEQ_DATA)); + } + } return downloadFiles; } @@ -163,30 +188,6 @@ public void downloadEnsemblCanonical() throws IOException, CellBaseException { logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENOME_INFO_DATA)); } - private List downloadRefSeq(Path refSeqDownloadPath) throws IOException, InterruptedException, CellBaseException { - List downloadFiles = new ArrayList<>(); - - // Check if the species is supported - if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), GENE_DATA)) { - logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataName(REFSEQ_DATA), getDataCategory(REFSEQ_DATA)); - - // GTF, DNA, RNA, Protein - String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); - DownloadProperties.URLProperties refSeqConfig = configuration.getDownload().getRefSeq(); - downloadFiles.add(downloadDataSource(refSeqConfig, prefixId + REFSEQ_GENOMIC_GTF_FILE_ID, refSeqDownloadPath)); - downloadFiles.add(downloadDataSource(refSeqConfig, prefixId + REFSEQ_GENOMIC_FNA_FILE_ID, refSeqDownloadPath)); - downloadFiles.add(downloadDataSource(refSeqConfig, prefixId + 
REFSEQ_RNA_FNA_FILE_ID, refSeqDownloadPath)); - downloadFiles.add(downloadDataSource(refSeqConfig, prefixId + REFSEQ_PROTEIN_FAA_FILE_ID, refSeqDownloadPath)); - - // Save data source (i.e., metadata) - saveDataSource(REFSEQ_DATA, refSeqConfig.getVersion(), getTimeStamp(), getUrls(downloadFiles), - refSeqDownloadPath.resolve(getDataVersionFilename(REFSEQ_DATA))); - - logger.info(CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE, getDataName(REFSEQ_DATA), getDataCategory(REFSEQ_DATA)); - } - return downloadFiles; - } - private DownloadFile downloadMane(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { DownloadFile downloadFile = null; @@ -329,7 +330,7 @@ private DownloadFile downloadGO(Path geneDownloadPath) throws IOException, Inter // Check if the species is supported if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME) - || speciesConfiguration.getScientificName().equals("Mus musculus")) { + || speciesConfiguration.getScientificName().equals(MUS_MUSCULUS_NAME)) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GO_ANNOTATION_DATA)); String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); From 0a6a84fdf7eca3490555d4adb3ab9342ce0ec6c0 Mon Sep 17 00:00:00 2001 From: imedina Date: Fri, 5 Jul 2024 01:16:31 +0100 Subject: [PATCH 086/148] Add VariationDownloader --- .../executors/DownloadCommandExecutor.java | 3 ++ .../src/main/resources/configuration.yml | 3 +- .../org/opencb/cellbase/lib/EtlCommons.java | 1 + .../lib/download/GeneDownloadManager.java | 52 +++++++++---------- 4 files changed, 31 insertions(+), 28 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java index 6c1f4194d5..1b7955b095 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java +++ 
b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java @@ -86,6 +86,9 @@ public void execute() throws CellBaseException { case REGULATION_DATA: downloader = new RegulationDownloadManager(species, assembly, outputDirectory, configuration); break; + case VARIATION_DATA: + downloader = new VariationDownloadManager(species, assembly, outputDirectory, configuration); + break; case VARIATION_FUNCTIONAL_SCORE_DATA: downloader = new CaddDownloadManager(species, assembly, outputDirectory, configuration); break; diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 4dfd7f9a1b..1551c28525 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -336,6 +336,7 @@ species: - gene - protein - regulation + - variation - variation_functional_score - missense_variation_functional_score - clinical_variant @@ -355,7 +356,7 @@ species: - gene - regulation - protein -# - variation + - variation - id: rnorvegicus scientificName: Rattus norvegicus assemblies: diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index f2c8ffed15..9a34acb936 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -405,6 +405,7 @@ public final class EtlCommons { dataNamesMap.put(SPLICE_SCORE_DATA, "Splice Score"); dataNamesMap.put(MMSPLICE_DATA, "MMSplice"); dataNamesMap.put(SPLICEAI_DATA, "SpliceAI"); + dataNamesMap.put(VARIATION_DATA, "Variation"); // Populate data categories map diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index 9f1132437d..eb6344596a 100644 --- 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -70,6 +70,9 @@ public List download() throws IOException, InterruptedException, C // Ensembl downloadFiles.addAll(downloadEnsemblData(ensemblDownloadPath)); + // Ensembl canonical + downloadEnsemblCanonical(); + // RefSeq downloadFiles.addAll(downloadRefSeq(refSeqDownloadPath)); @@ -87,25 +90,24 @@ public List download() throws IOException, InterruptedException, C downloadFiles.add(downloadGO(geneDownloadPath)); logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_ANNOTATION_DATA)); - downloadEnsemblCanonical(); - // Save data sources manually downloaded - // HPO - saveDataSource(HPO_DISEASE_DATA, configuration.getDownload().getHpo().getVersion(), getTimeStamp(), - Collections.singletonList(getManualUrl(configuration.getDownload().getHpo(), HPO_FILE_ID)), - geneDownloadPath.resolve(getDataVersionFilename(HPO_DISEASE_DATA))); - logger.warn("{} must be downloaded manually; the version file {} was created at {}", getDataName(HPO_DISEASE_DATA), - getDataVersionFilename(HPO_DISEASE_DATA), geneDownloadPath); - - // Cancer gene census - saveDataSource(CANCER_GENE_CENSUS_DATA, configuration.getDownload().getCancerGeneCensus().getVersion(), getTimeStamp(), - Collections.singletonList(getManualUrl(configuration.getDownload().getCancerGeneCensus(), CANCER_GENE_CENSUS_FILE_ID)), - geneDownloadPath.resolve(getDataVersionFilename(CANCER_GENE_CENSUS_DATA))); - logger.warn("{} must be downloaded manually; the version file {} was created at {}", getDataName(CANCER_GENE_CENSUS_DATA), - getDataVersionFilename(CANCER_GENE_CENSUS_DATA), geneDownloadPath); + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + // HPO + saveDataSource(HPO_DISEASE_DATA, configuration.getDownload().getHpo().getVersion(), getTimeStamp(), + 
Collections.singletonList(getManualUrl(configuration.getDownload().getHpo(), HPO_FILE_ID)), + geneDownloadPath.resolve(getDataVersionFilename(HPO_DISEASE_DATA))); + logger.warn("{} must be downloaded manually; the version file {} was created at {}", getDataName(HPO_DISEASE_DATA), + getDataVersionFilename(HPO_DISEASE_DATA), geneDownloadPath); + + // Cancer gene census + saveDataSource(CANCER_GENE_CENSUS_DATA, configuration.getDownload().getCancerGeneCensus().getVersion(), getTimeStamp(), + Collections.singletonList(getManualUrl(configuration.getDownload().getCancerGeneCensus(), CANCER_GENE_CENSUS_FILE_ID)), + geneDownloadPath.resolve(getDataVersionFilename(CANCER_GENE_CENSUS_DATA))); + logger.warn("{} must be downloaded manually; the version file {} was created at {}", getDataName(CANCER_GENE_CENSUS_DATA), + getDataVersionFilename(CANCER_GENE_CENSUS_DATA), geneDownloadPath); + } logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_DATA)); - return downloadFiles; } @@ -210,8 +212,7 @@ private DownloadFile downloadLrg(Path geneDownloadPath) throws IOException, Inte if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(LRG_DATA)); - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getLrg(), LRG_FILE_ID, LRG_DATA, - geneDownloadPath); + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getLrg(), LRG_FILE_ID, LRG_DATA, geneDownloadPath); logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(LRG_DATA)); } @@ -225,8 +226,7 @@ private DownloadFile downloadHgnc(Path geneDownloadPath) throws IOException, Int if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(HGNC_DATA)); - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getHgnc(), HGNC_FILE_ID, HGNC_DATA, - geneDownloadPath); + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getHgnc(), HGNC_FILE_ID, 
HGNC_DATA, geneDownloadPath); logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(HGNC_DATA)); } @@ -255,8 +255,7 @@ private DownloadFile downloadDrugData(Path geneDownloadPath) throws IOException, if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(DGIDB_DATA)); - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDgidb(), DGIDB_FILE_ID, DGIDB_DATA, - geneDownloadPath); + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDgidb(), DGIDB_FILE_ID, DGIDB_DATA, geneDownloadPath); logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(DGIDB_DATA)); } @@ -267,10 +266,10 @@ private DownloadFile downloadGeneUniprotXref(Path geneDownloadPath) throws IOExc DownloadFile downloadFile = null; // Check if the species is supported - if (GENE_UNIPROT_XREF_FILES.containsKey(speciesConfiguration.getScientificName())) { + String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); + if (configuration.getDownload().getGeneUniprotXref().getFiles().containsKey(prefixId + UNIPROT_XREF_FILE_ID)) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(UNIPROT_XREF_DATA)); - String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGeneUniprotXref(), prefixId + UNIPROT_XREF_FILE_ID, UNIPROT_XREF_DATA, geneDownloadPath); @@ -329,11 +328,10 @@ private DownloadFile downloadGO(Path geneDownloadPath) throws IOException, Inter DownloadFile downloadFile = null; // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME) - || speciesConfiguration.getScientificName().equals(MUS_MUSCULUS_NAME)) { + String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); + if (configuration.getDownload().getGoAnnotation().getFiles().containsKey(prefixId + GO_ANNOTATION_FILE_ID)) { 
logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GO_ANNOTATION_DATA)); - String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGoAnnotation(), prefixId + GO_ANNOTATION_FILE_ID, GO_ANNOTATION_DATA, geneDownloadPath); From 3dcad474ee8eff1999590bdb46650f1c05b8f651 Mon Sep 17 00:00:00 2001 From: imedina Date: Fri, 5 Jul 2024 01:16:41 +0100 Subject: [PATCH 087/148] Add VariationDownloader --- .../download/VariationDownloadManager.java | 78 +++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java new file mode 100644 index 0000000000..4efcc0e0d7 --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java @@ -0,0 +1,78 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.opencb.cellbase.lib.download; + +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.utils.SpeciesUtils; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import static org.opencb.cellbase.lib.EtlCommons.*; + +public class VariationDownloadManager extends AbstractDownloadManager { + + public VariationDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) + throws IOException, CellBaseException { + super(species, assembly, targetDirectory, configuration); + } + + @Override + public List download() throws IOException, InterruptedException, CellBaseException { + return downloadVariation(); + } + + public List downloadVariation() throws IOException, InterruptedException, CellBaseException { + List downloadFiles = new ArrayList<>(); + + // Check if species is supported + if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), VARIATION_DATA)) { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(VARIATION_DATA)); + + Path variationFolder = downloadFolder.resolve(VARIATION_DATA); + Files.createDirectories(variationFolder); + + // We do not need to download human variation data from Ensembl. It is already included in the CellBase. 
+ if (!speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + String fileName = variationFolder.resolve(speciesShortName + ".vcf.gz").toString(); + String url = ensemblHostUrl + "/" + ensemblRelease + "/variation/vcf/" + speciesShortName + "/" + + speciesShortName + ".vcf.gz"; + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, fileName); + downloadFiles.add(downloadFile(url, fileName)); + logger.info(OK_LOG_MESSAGE); + saveDataSource(VARIATION_DATA, ensemblVersion, getTimeStamp(), Collections.singletonList(url), + variationFolder.resolve(getDataVersionFilename(VARIATION_DATA))); + + fileName = variationFolder.resolve(speciesShortName + "_structural_variations.vcf.gz").toString(); + url = ensemblHostUrl + "/" + ensemblRelease + "/variation/vcf/" + speciesShortName + "/" + + speciesShortName + "_structural_variations.vcf.gz"; + downloadFiles.add(downloadFile(url, fileName)); + logger.info(OK_LOG_MESSAGE); + saveDataSource(VARIATION_DATA, ensemblVersion, getTimeStamp(), Collections.singletonList(url), + variationFolder.resolve(getDataVersionFilename(VARIATION_DATA))); + } + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(VARIATION_DATA)); + } + + return downloadFiles; + } +} From 5eb33ae02bb064221954e3fe647acc588c7f63b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 8 Jul 2024 09:01:05 +0200 Subject: [PATCH 088/148] app: update Dockerfile for cellbase-builder in order to allow the script ensembl_canonical.pl to create sub-folders, #TASK-5575, #TASK-5564 On branch TASK-5564 Changes to be committed: modified: cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile --- cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile b/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile index 5235637267..bcb2de9cb8 100644 ---
a/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile +++ b/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile @@ -29,4 +29,7 @@ RUN cd /opt/ensembl && \ git clone https://github.com/Ensembl/ensembl-io.git && \ git clone --branch cvs/release-0_7 https://github.com/biomart/biomart-perl +## Give write permissions to allow the script ensembl_canonical.pl to create sub-folders for caching purposes +RUN chmod -R 777 /opt/cellbase/scripts/ensembl-scripts/ + ENV PERL5LIB=$PERL5LIB:/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase/scripts/ensembl-scripts:/opt/ensembl/biomart-perl/lib From 2b226fe624f3e99208932556763e9646088c9ce8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 9 Jul 2024 09:19:05 +0200 Subject: [PATCH 089/148] lib: add variation to the EtlCommons dataVersionFilenamesMap, #TASK-5575, #TASK-5564 On branch TASK-5564 Changes to be committed: modified: cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java --- .../src/main/java/org/opencb/cellbase/lib/EtlCommons.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 9a34acb936..35cbba0e97 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -451,6 +451,7 @@ public final class EtlCommons { dataCategoriesMap.put(GWAS_DATA, dataNamesMap.get(CLINICAL_VARIANT_DATA)); dataCategoriesMap.put(MMSPLICE_DATA, dataNamesMap.get(SPLICE_SCORE_DATA)); dataCategoriesMap.put(SPLICEAI_DATA, dataNamesMap.get(SPLICE_SCORE_DATA)); + dataCategoriesMap.put(VARIATION_DATA, dataNamesMap.get(VARIATION_DATA)); // Populate data version filenames Map
dataVersionFilenamesMap.put(ENSEMBL_DATA, "ensemblCore" + SUFFIX_VERSION_FILENAME); @@ -495,6 +496,7 @@ public final class EtlCommons { dataVersionFilenamesMap.put(GWAS_DATA, "gwas" + SUFFIX_VERSION_FILENAME); dataVersionFilenamesMap.put(MMSPLICE_DATA, "mmSplice" + SUFFIX_VERSION_FILENAME); dataVersionFilenamesMap.put(SPLICEAI_DATA, "spliceAi" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(VARIATION_DATA, "variation" + SUFFIX_VERSION_FILENAME); } private EtlCommons() { From 5514177778513e48079d759545d1999339506dee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 23 Jul 2024 10:53:18 +0200 Subject: [PATCH 090/148] lib: remove unused variables, #TASK-5575, #TASK-5564 --- .../cellbase/lib/download/GeneDownloadManager.java | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index eb6344596a..3e2f104b89 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -32,19 +32,6 @@ public class GeneDownloadManager extends AbstractDownloadManager { - private static final Map GENE_UNIPROT_XREF_FILES; - - static { - GENE_UNIPROT_XREF_FILES = new HashMap<>(); - GENE_UNIPROT_XREF_FILES.put(HOMO_SAPIENS_NAME, "HUMAN_9606_idmapping_selected.tab.gz"); - GENE_UNIPROT_XREF_FILES.put(MUS_MUSCULUS_NAME, "MOUSE_10090_idmapping_selected.tab.gz"); - GENE_UNIPROT_XREF_FILES.put(RATTUS_NORVEGICUS_NAME, "RAT_10116_idmapping_selected.tab.gz"); - GENE_UNIPROT_XREF_FILES.put(DANIO_RERIO_NAME, "DANRE_7955_idmapping_selected.tab.gz"); - GENE_UNIPROT_XREF_FILES.put("Drosophila melanogaster", "DROME_7227_idmapping_selected.tab.gz"); - GENE_UNIPROT_XREF_FILES.put("Saccharomyces cerevisiae", 
"YEAST_559292_idmapping_selected.tab.gz"); - GENE_UNIPROT_XREF_FILES.put("Caenorhabditis elegans", "CAEEL_6239_idmapping_selected.tab.gz"); - } - public GeneDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) throws IOException, CellBaseException { super(species, assembly, targetDirectory, configuration); From ae9a8176b387197ed7c143007d69674fced1e8d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 23 Jul 2024 11:45:35 +0200 Subject: [PATCH 091/148] core: add the field 'id' in DataSource model, #TASK-5575, #TASK-5564 --- .../cellbase/core/models/DataSource.java | 22 ++++++++++++++----- .../lib/download/AbstractDownloadManager.java | 2 +- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/models/DataSource.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/models/DataSource.java index f716412a03..acc134cb63 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/models/DataSource.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/models/DataSource.java @@ -21,6 +21,7 @@ public class DataSource { + private String id; private String name; private String category; private String version; @@ -31,7 +32,8 @@ public DataSource() { this.urls = new ArrayList<>(); } - public DataSource(String name, String category, String version, String downloadDate, List urls) { + public DataSource(String id, String name, String category, String version, String downloadDate, List urls) { + this.id = id; this.name = name; this.category = category; this.version = version; @@ -41,8 +43,9 @@ public DataSource(String name, String category, String version, String downloadD @Override public String toString() { - final StringBuilder sb = new StringBuilder("DataSourceDescr{"); - sb.append("name='").append(name).append('\''); + final StringBuilder sb = new StringBuilder("DataSource{"); + 
sb.append("id='").append(id).append('\''); + sb.append(", name='").append(name).append('\''); sb.append(", category='").append(category).append('\''); sb.append(", version='").append(version).append('\''); sb.append(", downloadDate='").append(downloadDate).append('\''); @@ -51,6 +54,15 @@ public String toString() { return sb.toString(); } + public String getId() { + return id; + } + + public DataSource setId(String id) { + this.id = id; + return this; + } + public String getName() { return name; } @@ -82,8 +94,8 @@ public String getDownloadDate() { return downloadDate; } - public DataSource setDownloadDate(String downloadedDate) { - this.downloadDate = downloadedDate; + public DataSource setDownloadDate(String downloadDate) { + this.downloadDate = downloadDate; return this; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index df57b06f8b..ff474a802b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -225,7 +225,7 @@ protected void saveDataSource(String data, String version, String date, List Date: Tue, 23 Jul 2024 11:49:59 +0200 Subject: [PATCH 092/148] core: update DGIdb in the configuration file, #TASK-5575, #TASK-5564 --- cellbase-core/src/main/resources/configuration.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 1551c28525..ca0ac4c099 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -109,10 +109,10 @@ download: files: CANCER_HOTSPOT: files/hotspots_v2.xls dgidb: - host: https://old.dgidb.org/ - version: "2022-02-01" + host: https://dgidb.org/ + version: "DGIdb 
v.5.0.7 (07/06/2024)" files: - DGIDB: data/monthly_tsvs/2022-Feb/interactions.tsv + DGIDB: data/latest/interactions.tsv geneUniprotXref: host: http://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/ version: "2024-03-27" From 9d2d4fe39d87fd068b04e084fa9e492161d98d3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 23 Jul 2024 16:39:30 +0200 Subject: [PATCH 093/148] lib: check if genome data is already downloaded before downloading to skip, #TASK-5575, #TASK-5564 --- .../lib/download/AbstractDownloadManager.java | 2 ++ .../lib/download/GenomeDownloadManager.java | 25 ++++++++++++++++--- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index ff474a802b..b2a098f7e4 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -56,6 +56,8 @@ public abstract class AbstractDownloadManager { protected static final String CATEGORY_DOWNLOADING_LOG_MESSAGE = "Downloading {}/{} ..."; protected static final String CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE = "Downloading {}/{} done."; protected static final String DOWNLOADING_FROM_TO_LOG_MESSAGE = "Downloading {} to {} ..."; + protected static final String DATA_ALREADY_DOWNLOADED = "The file {} already exists, indicating that the data {} has already been" + + " downloaded."; protected String species; protected String assembly; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index 7fdec60e8f..9028123298 100644 --- 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -25,6 +25,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.AbstractMap; +import java.util.ArrayList; import java.util.Collections; import java.util.List; @@ -32,9 +33,13 @@ public class GenomeDownloadManager extends AbstractDownloadManager { + private Path sequenceFolder; + public GenomeDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) throws IOException, CellBaseException { super(species, assembly, targetDirectory, configuration); + + this.sequenceFolder = downloadFolder.resolve(GENOME_DATA); } @Override @@ -44,8 +49,14 @@ public List download() throws IOException, InterruptedException, C } public List downloadReferenceGenome() throws IOException, InterruptedException, CellBaseException { + Path genomeVersionFilePath = sequenceFolder.resolve(getDataVersionFilename(GENOME_DATA)); + + if (Files.exists(genomeVersionFilePath)) { + logger.info(DATA_ALREADY_DOWNLOADED, genomeVersionFilePath.getFileName(), getDataName(GENOME_DATA)); + return new ArrayList<>(); + } + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENOME_DATA)); - Path sequenceFolder = downloadFolder.resolve(GENOME_DATA); Files.createDirectories(sequenceFolder); // Reference genome sequences are downloaded from Ensembl @@ -55,7 +66,7 @@ public List downloadReferenceGenome() throws IOException, Interrup // Save data source saveDataSource(GENOME_DATA, ensemblVersion, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - sequenceFolder.resolve(getDataVersionFilename(GENOME_DATA))); + genomeVersionFilePath); logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENOME_DATA)); @@ -63,8 +74,14 @@ public List downloadReferenceGenome() throws IOException, Interrup } public void downloadGenomeInfo() throws IOException, 
CellBaseException { + String genomeInfoFilename = "genome_info.json"; + + if (Files.exists(sequenceFolder.resolve(genomeInfoFilename))) { + logger.info(DATA_ALREADY_DOWNLOADED, genomeInfoFilename, getDataName(GENOME_INFO_DATA)); + return; + } + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENOME_INFO_DATA)); - Path sequenceFolder = downloadFolder.resolve(GENOME_DATA); Files.createDirectories(sequenceFolder); String dockerImage = "opencb/cellbase-builder:" + GitRepositoryState.get().getBuildVersion(); @@ -77,7 +94,7 @@ public void downloadGenomeInfo() throws IOException, CellBaseException { // Params String params = "/opt/cellbase/scripts/ensembl-scripts/genome_info.pl" + " --species \"" + speciesConfiguration.getScientificName() + "\"" - + " --outfile \"" + outputBinding.getValue() + "/genome_info.json\""; + + " --outfile \"" + outputBinding.getValue() + "/" + genomeInfoFilename + "\""; // Execute perl script in docker DockerUtils.run(dockerImage, null, outputBinding, params, null); From 299003ba7de4d3740c271e5a3c00e833537f55c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 23 Jul 2024 17:17:40 +0200 Subject: [PATCH 094/148] lib: add the parameter 'assembly' to command line when calling the script genome_info.pl, #TASK-5575, #TASK-5564 --- cellbase-app/app/scripts/ensembl-scripts/genome_info.pl | 4 ++-- .../opencb/cellbase/lib/download/GenomeDownloadManager.java | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl b/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl index f229fdb47d..9cbc01a4cd 100755 --- a/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl +++ b/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl @@ -17,9 +17,9 @@ #################################################################### ## Parsing command line options #################################### 
#################################################################### -##docker run -it --mount type=bind,source=/tmp,target=/tmp opencb/cellbase-builder:6.2.0-SNAPSHOT /opt/cellbase/scripts/ensembl-scripts/genome_info.pl --species "Mus musculus" --outfile /tmp +##docker run -it --mount type=bind,source=/tmp,target=/tmp opencb/cellbase-builder:6.2.0-SNAPSHOT /opt/cellbase/scripts/ensembl-scripts/genome_info.pl --species "Mus musculus" --assembly GRCm39 --outfile /tmp -# USAGE: ./genome_info.pl --species "Homo sapiens" --outfile ../../appl_db/ird_v1/hsa ... +# USAGE: ./genome_info.pl --species "Homo sapiens" --assembly GRCh38 --outfile ../../appl_db/ird_v1/hsa ... ## Parsing command line GetOptions ('species=s' => \$species, 'assembly=s' => \$assembly, 'o|outfile=s' => \$outfile, 'phylo=s' => \$phylo, diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index 9028123298..fa37411729 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -94,6 +94,7 @@ public void downloadGenomeInfo() throws IOException, CellBaseException { // Params String params = "/opt/cellbase/scripts/ensembl-scripts/genome_info.pl" + " --species \"" + speciesConfiguration.getScientificName() + "\"" + + " --assembly \"" + assemblyConfiguration.getName() + "\"" + " --outfile \"" + outputBinding.getValue() + "/" + genomeInfoFilename + "\""; // Execute perl script in docker From 1d171d528d5b3bc27fba8281839456bdbdcb3cff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 24 Jul 2024 09:09:58 +0200 Subject: [PATCH 095/148] lib: update GeneDownloadManager to call the script gene_extra_info.pl, #TASK-5575, #TASK-5564 --- .../ensembl-scripts/ensembl_canonical.pl | 17 +++++++- 
.../scripts/ensembl-scripts/genome_info.pl | 2 +- .../org/opencb/cellbase/lib/EtlCommons.java | 4 ++ .../lib/download/GeneDownloadManager.java | 40 +++++++++++++++---- 4 files changed, 53 insertions(+), 10 deletions(-) diff --git a/cellbase-app/app/scripts/ensembl-scripts/ensembl_canonical.pl b/cellbase-app/app/scripts/ensembl-scripts/ensembl_canonical.pl index 9be361a55d..bed648e2d0 100755 --- a/cellbase-app/app/scripts/ensembl-scripts/ensembl_canonical.pl +++ b/cellbase-app/app/scripts/ensembl-scripts/ensembl_canonical.pl @@ -36,7 +36,12 @@ $query->formatter("TSV"); -open (ENSEMBL_CANONICAL, ">$outdir/ensembl_canonical.txt") || die "Cannot open ensembl_canonical.txt file"; +# Open the file for writing +open(my $fh, '>', "$outdir/ensembl_canonical.txt") or die "Cannot open ensembl_canonical.txt file: $!"; + +# Save a duplicate of the original stdout (a plain glob copy would only alias the handle redirected below) +open(my $original_stdout, '>&', \*STDOUT) or die "Can't save STDOUT: $!"; +open(STDOUT, '>&', $fh) or die "Can't redirect STDOUT: $!"; my $query_runner = BioMart::QueryRunner->new(); @@ -44,5 +49,13 @@ $query_runner->uniqueRowsOnly(1); $query_runner->execute($query); #$query_runner->printHeader(); -print ENSEMBL_CANONICAL $query_runner->printResults(); +#print ENSEMBL_CANONICAL $query_runner->printResults(); +# Call printResults which prints to STDOUT (now redirected to the file) +$query_runner->printResults(); #$query_runner->printFooter(); + +# Restore the original stdout +open(STDOUT, '>&', $original_stdout) or die "Can't restore STDOUT: $!"; + +# Close the filehandle +close($fh) or die "Failed to close file: $!"; \ No newline at end of file diff --git a/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl b/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl index 9cbc01a4cd..8ecf3d7c8f 100755 --- a/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl +++ b/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl @@ -143,7 +143,7 @@ sub print_parameters { print "Parameters: "; - print "species: $species, outfile: $outfile, "; + print "species: $species,
assembly: $assembly, outfile: $outfile, "; print "ensembl-registry: $ENSEMBL_REGISTRY, "; print "ensembl-host: $ENSEMBL_HOST, ensembl-port: $ENSEMBL_PORT, "; print "ensembl-user: $ENSEMBL_USER, verbose: $verbose, help: $help"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 35cbba0e97..dcec1f6de5 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -110,6 +110,8 @@ public final class EtlCommons { public static final String REFSEQ_RNA_FNA_FILE_ID = "RNA_FNA"; // Gene annotation + public static final String ENSEMBL_CANONICAL_DATA = "ensembl_canonical"; + public static final String GENE_EXTRA_INFO_DATA = "gene_extra_info"; // - MANE Select public static final String MANE_SELECT_DATA = "MANE Select"; // Must match the configuration file @@ -354,6 +356,8 @@ public final class EtlCommons { dataNamesMap.put(GENOME_DATA, "Genome"); dataNamesMap.put(GENOME_INFO_DATA, "Genome Info"); dataNamesMap.put(GENE_DATA, "Gene"); + dataNamesMap.put(ENSEMBL_CANONICAL_DATA, "Ensembl canonical"); + dataNamesMap.put(GENE_EXTRA_INFO_DATA, "Gene extra info"); dataNamesMap.put(GENE_ANNOTATION_DATA, "Gene Annotation"); dataNamesMap.put(MANE_SELECT_DATA, "MANE Select"); dataNamesMap.put(LRG_DATA, "LRG"); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index 3e2f104b89..356c637ca7 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -58,7 +58,10 @@ public List download() throws IOException, InterruptedException, C downloadFiles.addAll(downloadEnsemblData(ensemblDownloadPath)); // Ensembl canonical - 
downloadEnsemblCanonical(); + downloadEnsemblCanonical(geneDownloadPath); + + // Gene extra info + downloadGeneExtraInfo(geneDownloadPath); // RefSeq downloadFiles.addAll(downloadRefSeq(refSeqDownloadPath)); @@ -151,20 +154,43 @@ private List downloadRefSeq(Path refSeqDownloadPath) throws IOExce return downloadFiles; } - public void downloadEnsemblCanonical() throws IOException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENOME_INFO_DATA)); - Path sequenceFolder = downloadFolder.resolve(GENOME_DATA); - Files.createDirectories(sequenceFolder); + public void downloadEnsemblCanonical(Path geneDownloadPath) throws IOException, CellBaseException { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(ENSEMBL_CANONICAL_DATA)); String dockerImage = "opencb/cellbase-builder:" + GitRepositoryState.get().getBuildVersion(); try { // Build command line to run Perl script via docker image // Output binding AbstractMap.SimpleEntry outputBinding = new AbstractMap.SimpleEntry<>( - sequenceFolder.toAbsolutePath().toString(), "/tmp"); + geneDownloadPath.toAbsolutePath().toString(), "/tmp"); // Params String params = "/opt/cellbase/scripts/ensembl-scripts/ensembl_canonical.pl" + + " --species \"" + speciesConfiguration.getId() + "\"" + + " --assembly \"" + assemblyConfiguration.getName() + "\"" + + " --outdir \"" + outputBinding.getValue() + "\""; + + // Execute perl script in docker + DockerUtils.run(dockerImage, null, outputBinding, params, null); + } catch (Exception e) { + throw new CellBaseException("Error executing Perl script from Docker " + dockerImage, e); + } + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(ENSEMBL_CANONICAL_DATA)); + } + + public void downloadGeneExtraInfo(Path geneDownloadPath) throws IOException, CellBaseException { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_EXTRA_INFO_DATA)); + + String dockerImage = "opencb/cellbase-builder:" + GitRepositoryState.get().getBuildVersion(); + try { + // Build command line to 
run Perl script via docker image + // Output binding + AbstractMap.SimpleEntry outputBinding = new AbstractMap.SimpleEntry<>( + geneDownloadPath.toAbsolutePath().toString(), "/tmp"); + + // Params + String params = "/opt/cellbase/scripts/ensembl-scripts/gene_extra_info.pl" + " --species \"" + speciesConfiguration.getId() + "\"" + " --outdir \"" + outputBinding.getValue() + "\""; @@ -174,7 +200,7 @@ public void downloadEnsemblCanonical() throws IOException, CellBaseException { throw new CellBaseException("Error executing Perl script from Docker " + dockerImage, e); } - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENOME_INFO_DATA)); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_EXTRA_INFO_DATA)); } private DownloadFile downloadMane(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { From d10931d21f3c566b1e7fe1c85487084223b836a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 24 Jul 2024 11:34:03 +0200 Subject: [PATCH 096/148] lib: improve genome and conservation downloaders by checking if data is already downloaded, and fix sonnar issues, #TASK-5575, #TASK-5576 --- .../lib/download/AbstractDownloadManager.java | 8 + .../download/ConservationDownloadManager.java | 147 +++++++++++------- .../lib/download/GenomeDownloadManager.java | 9 +- 3 files changed, 106 insertions(+), 58 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index b2a098f7e4..975e182cb7 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -334,6 +334,14 @@ public static void writeDownloadLogFile(Path downloadFolder, List writer.writeValue(new File(downloadFolder + "/download_log.json"), 
downloadFiles); } + public boolean isAlreadyDownloaded(Path path, String dataName) { + if (Files.exists(path)) { + logger.info(DATA_ALREADY_DOWNLOADED, path.getFileName(), dataName); + return true; + } + return false; + } + private boolean validateDownloadFile(DownloadFile downloadFile, String outputFileName, String outputFileLog) { long expectedFileSize = getExpectedFileSize(outputFileLog); long actualFileSize = FileUtils.sizeOf(new File(outputFileName)); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ConservationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ConservationDownloadManager.java index f9a33b5c9c..64be42ed7c 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ConservationDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ConservationDownloadManager.java @@ -54,14 +54,26 @@ public List downloadConservation() throws IOException, Interrupted // Check if the species is supported if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), CONSERVATION_DATA)) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); // Create folders Path conservationFolder = downloadFolder.resolve(CONSERVATION_DATA); Files.createDirectories(conservationFolder); - Files.createDirectories(conservationFolder.resolve(GERP_DATA)); - Files.createDirectories(conservationFolder.resolve(PHASTCONS_DATA)); - Files.createDirectories(conservationFolder.resolve(PHYLOP_DATA)); + Path gerpFolder = Files.createDirectories(conservationFolder.resolve(GERP_DATA)); + Path phastConsFolder = Files.createDirectories(conservationFolder.resolve(PHASTCONS_DATA)); + Path phyloPFolder = Files.createDirectories(conservationFolder.resolve(PHYLOP_DATA)); + + // Already downloaded ? 
+ boolean downloadGerp = !isAlreadyDownloaded(gerpFolder.resolve(getDataVersionFilename(GERP_DATA)), getDataName(GERP_DATA)); + boolean downloadPhastCons = !isAlreadyDownloaded(phastConsFolder.resolve(getDataVersionFilename(PHASTCONS_DATA)), + getDataName(PHASTCONS_DATA)); + boolean downloadPhyloP = !isAlreadyDownloaded(phyloPFolder.resolve(getDataVersionFilename(PHYLOP_DATA)), + getDataName(PHYLOP_DATA)); + + if (!downloadGerp && !downloadPhastCons && !downloadPhyloP) { + return new ArrayList<>(); + } + + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); // Download data String filename; @@ -80,30 +92,39 @@ public List downloadConservation() throws IOException, Interrupted String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "M"}; for (String chromosome : chromosomes) { - logger.info(DOWNLOADING_LOG_MESSAGE, "phastConst " + chromosome); - String phastConsUrl = phastconsHost + configuration.getDownload().getPhastCons().getFiles().get(PHASTCONS_FILE_ID) - + "chr" + chromosome + ".phastCons470way.wigFix.gz"; - filename = Paths.get(phastConsUrl).getFileName().toString(); - outputPath = conservationFolder.resolve(PHASTCONS_DATA).resolve(filename); - downloadFiles.add(downloadFile(phastConsUrl, outputPath.toString())); - phastconsUrls.add(phastConsUrl); - - logger.info(DOWNLOADING_LOG_MESSAGE, "phyloP " + chromosome); - String phyloPUrl = phylopHost + configuration.getDownload().getPhylop().getFiles().get(PHYLOP_FILE_ID) - + "chr" + chromosome + ".phyloP470way.wigFix.gz"; - filename = Paths.get(phyloPUrl).getFileName().toString(); - outputPath = conservationFolder.resolve(PHYLOP_DATA).resolve(filename); - downloadFiles.add(downloadFile(phyloPUrl, outputPath.toString())); - phyloPUrls.add(phyloPUrl); + if (downloadPhastCons) { + logger.info(DOWNLOADING_LOG_MESSAGE, getChromDownloadMessage(getDataName(PHASTCONS_DATA), chromosome)); + String 
phastConsUrl = phastconsHost + configuration.getDownload().getPhastCons().getFiles().get(PHASTCONS_FILE_ID) + + "chr" + chromosome + ".phastCons470way.wigFix.gz"; + filename = Paths.get(phastConsUrl).getFileName().toString(); + outputPath = conservationFolder.resolve(PHASTCONS_DATA).resolve(filename); + downloadFiles.add(downloadFile(phastConsUrl, outputPath.toString())); + phastconsUrls.add(phastConsUrl); + logger.info(OK_LOG_MESSAGE); + } + + if (downloadPhyloP) { + logger.info(DOWNLOADING_LOG_MESSAGE, getChromDownloadMessage(getDataName(PHYLOP_DATA), chromosome)); + String phyloPUrl = phylopHost + configuration.getDownload().getPhylop().getFiles().get(PHYLOP_FILE_ID) + + "chr" + chromosome + ".phyloP470way.wigFix.gz"; + filename = Paths.get(phyloPUrl).getFileName().toString(); + outputPath = conservationFolder.resolve(PHYLOP_DATA).resolve(filename); + downloadFiles.add(downloadFile(phyloPUrl, outputPath.toString())); + phyloPUrls.add(phyloPUrl); + logger.info(OK_LOG_MESSAGE); + } } // 2. 
Gerp - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GERP_DATA)); - gerpUrl = configuration.getDownload().getGerp().getHost() - + configuration.getDownload().getGerp().getFiles().get(GERP_FILE_ID); - filename = Paths.get(gerpUrl).getFileName().toString(); - outputPath = conservationFolder.resolve(GERP_DATA).resolve(filename); - downloadFiles.add(downloadFile(gerpUrl, outputPath.toString())); + if (downloadGerp) { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GERP_DATA)); + gerpUrl = configuration.getDownload().getGerp().getHost() + + configuration.getDownload().getGerp().getFiles().get(GERP_FILE_ID); + filename = Paths.get(gerpUrl).getFileName().toString(); + outputPath = conservationFolder.resolve(GERP_DATA).resolve(filename); + downloadFiles.add(downloadFile(gerpUrl, outputPath.toString())); + logger.info(OK_LOG_MESSAGE); + } } // Mouse @@ -114,43 +135,63 @@ public List downloadConservation() throws IOException, Interrupted String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "X", "Y", "M"}; for (String chromosome : chromosomes) { - logger.info(DOWNLOADING_LOG_MESSAGE, "phastConst " + chromosome); - String phastConsUrl = phastconsHost - + configuration.getDownload().getPhastCons().getFiles().get(prefixId + PHASTCONS_FILE_ID) - + "chr" + chromosome + ".phastCons35way.wigFix.gz"; - filename = Paths.get(phastConsUrl).getFileName().toString(); - outputPath = conservationFolder.resolve(PHASTCONS_DATA).resolve(filename); - downloadFiles.add(downloadFile(phastConsUrl, outputPath.toString())); - phastconsUrls.add(phastConsUrl); - - logger.info(DOWNLOADING_LOG_MESSAGE, "phyloP " + chromosome); - String phyloPUrl = phylopHost + configuration.getDownload().getPhylop().getFiles().get(prefixId + PHYLOP_FILE_ID) - + "chr" + chromosome + ".phyloP35way.wigFix.gz"; - filename = Paths.get(phyloPUrl).getFileName().toString(); - outputPath = conservationFolder.resolve(PHYLOP_DATA).resolve(filename); 
- downloadFiles.add(downloadFile(phyloPUrl, outputPath.toString())); - phyloPUrls.add(phyloPUrl); + if (downloadPhastCons) { + logger.info(DOWNLOADING_LOG_MESSAGE, getChromDownloadMessage(getDataName(PHASTCONS_DATA), chromosome)); + String phastConsUrl = phastconsHost + + configuration.getDownload().getPhastCons().getFiles().get(prefixId + PHASTCONS_FILE_ID) + + "chr" + chromosome + ".phastCons35way.wigFix.gz"; + filename = Paths.get(phastConsUrl).getFileName().toString(); + outputPath = conservationFolder.resolve(PHASTCONS_DATA).resolve(filename); + downloadFiles.add(downloadFile(phastConsUrl, outputPath.toString())); + phastconsUrls.add(phastConsUrl); + logger.info(OK_LOG_MESSAGE); + } + + if (downloadPhyloP) { + logger.info(DOWNLOADING_LOG_MESSAGE, getChromDownloadMessage(getDataName(PHYLOP_DATA), chromosome)); + String phyloPUrl = phylopHost + configuration.getDownload().getPhylop().getFiles().get(prefixId + PHYLOP_FILE_ID) + + "chr" + chromosome + ".phyloP35way.wigFix.gz"; + filename = Paths.get(phyloPUrl).getFileName().toString(); + outputPath = conservationFolder.resolve(PHYLOP_DATA).resolve(filename); + downloadFiles.add(downloadFile(phyloPUrl, outputPath.toString())); + phyloPUrls.add(phyloPUrl); + logger.info(OK_LOG_MESSAGE); + } } // 2. 
Gerp - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GERP_DATA)); - gerpUrl = configuration.getDownload().getGerp().getHost() - + configuration.getDownload().getGerp().getFiles().get(prefixId + GERP_FILE_ID); - filename = Paths.get(gerpUrl).getFileName().toString(); - outputPath = conservationFolder.resolve(GERP_DATA).resolve(filename); - downloadFiles.add(downloadFile(gerpUrl, outputPath.toString())); + if (downloadGerp) { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GERP_DATA)); + gerpUrl = configuration.getDownload().getGerp().getHost() + + configuration.getDownload().getGerp().getFiles().get(prefixId + GERP_FILE_ID); + filename = Paths.get(gerpUrl).getFileName().toString(); + outputPath = conservationFolder.resolve(GERP_DATA).resolve(filename); + downloadFiles.add(downloadFile(gerpUrl, outputPath.toString())); + logger.info(OK_LOG_MESSAGE); + } } // Save data version - saveDataSource(PHASTCONS_DATA, configuration.getDownload().getPhastCons().getVersion(), getTimeStamp(), phastconsUrls, - conservationFolder.resolve(getDataVersionFilename(PHASTCONS_DATA))); - saveDataSource(PHYLOP_DATA, configuration.getDownload().getPhylop().getVersion(), getTimeStamp(), phyloPUrls, - conservationFolder.resolve(getDataVersionFilename(PHYLOP_DATA))); - saveDataSource(GERP_DATA, configuration.getDownload().getGerp().getVersion(), getTimeStamp(), - Collections.singletonList(gerpUrl), conservationFolder.resolve(getDataVersionFilename(GERP_DATA))); + if (downloadPhastCons) { + saveDataSource(PHASTCONS_DATA, configuration.getDownload().getPhastCons().getVersion(), getTimeStamp(), phastconsUrls, + phastConsFolder.resolve(getDataVersionFilename(PHASTCONS_DATA))); + } + if (downloadPhyloP) { + saveDataSource(PHYLOP_DATA, configuration.getDownload().getPhylop().getVersion(), getTimeStamp(), phyloPUrls, + phyloPFolder.resolve(getDataVersionFilename(PHYLOP_DATA))); + } + if (downloadGerp) { + saveDataSource(GERP_DATA, configuration.getDownload().getGerp().getVersion(), 
getTimeStamp(), + Collections.singletonList(gerpUrl), gerpFolder.resolve(getDataVersionFilename(GERP_DATA))); + } + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); } return downloadFiles; } + private String getChromDownloadMessage(String dataName, String chromosome) { + return dataName + ", chrom. " + chromosome; + } + } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index fa37411729..e5ad0d8824 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -51,8 +51,8 @@ public List download() throws IOException, InterruptedException, C public List downloadReferenceGenome() throws IOException, InterruptedException, CellBaseException { Path genomeVersionFilePath = sequenceFolder.resolve(getDataVersionFilename(GENOME_DATA)); - if (Files.exists(genomeVersionFilePath)) { - logger.info(DATA_ALREADY_DOWNLOADED, genomeVersionFilePath.getFileName(), getDataName(GENOME_DATA)); + // Already downloaded + if (isAlreadyDownloaded(genomeVersionFilePath, getDataName(GENOME_DATA))) { return new ArrayList<>(); } @@ -76,8 +76,8 @@ public List downloadReferenceGenome() throws IOException, Interrup public void downloadGenomeInfo() throws IOException, CellBaseException { String genomeInfoFilename = "genome_info.json"; - if (Files.exists(sequenceFolder.resolve(genomeInfoFilename))) { - logger.info(DATA_ALREADY_DOWNLOADED, genomeInfoFilename, getDataName(GENOME_INFO_DATA)); + // Already downloaded + if (isAlreadyDownloaded(sequenceFolder.resolve(genomeInfoFilename), getDataName(GENOME_INFO_DATA))) { return; } @@ -105,5 +105,4 @@ public void downloadGenomeInfo() throws IOException, CellBaseException { logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENOME_INFO_DATA)); } - } From 
b422f3abb77fec8576db8937cea2405943d4edbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 24 Jul 2024 11:58:00 +0200 Subject: [PATCH 097/148] lib: improve repeats downloaders by checking if data is already downloaded, #TASK-5575, #TASK-5576 --- .../lib/download/RepeatsDownloadManager.java | 36 +++++++++++++++---- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RepeatsDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RepeatsDownloadManager.java index 0122893833..77a8f160f7 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RepeatsDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RepeatsDownloadManager.java @@ -46,47 +46,69 @@ public List downloadRepeats() throws IOException, InterruptedExcep // Check if species is supported if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), REPEATS_DATA)) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(REPEATS_DATA)); Path repeatsFolder = downloadFolder.resolve(REPEATS_DATA); Files.createDirectories(repeatsFolder); + Path trfFolder = Files.createDirectories(repeatsFolder.resolve(TRF_DATA)); + Path wmFolder = Files.createDirectories(repeatsFolder.resolve(WM_DATA)); + Path gsdFolder = Files.createDirectories(repeatsFolder.resolve(GSD_DATA)); String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); + // Already downloaded ? 
+ boolean downloadTrf = !isAlreadyDownloaded(trfFolder.resolve(getDataVersionFilename(TRF_DATA)), getDataName(TRF_DATA)) + && configuration.getDownload().getSimpleRepeats().getFiles().containsKey(prefixId + SIMPLE_REPEATS_FILE_ID); + boolean downloadWm = !isAlreadyDownloaded(wmFolder.resolve(getDataVersionFilename(WM_DATA)), getDataName(WM_DATA)) + && configuration.getDownload().getWindowMasker().getFiles().containsKey(prefixId + WINDOW_MASKER_FILE_ID); + boolean downloadGsd = !isAlreadyDownloaded(gsdFolder.resolve(getDataVersionFilename(GSD_DATA)), getDataName(GSD_DATA)) + && configuration.getDownload().getGenomicSuperDups().getFiles().containsKey(prefixId + GENOMIC_SUPER_DUPS_FILE_ID); + + if (!downloadTrf && !downloadWm && !downloadGsd) { + return new ArrayList<>(); + } + + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(REPEATS_DATA)); + // Download tandem repeat finder - if (configuration.getDownload().getSimpleRepeats().getFiles().containsKey(prefixId + SIMPLE_REPEATS_FILE_ID)) { + if (downloadTrf) { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(TRF_DATA)); String url = configuration.getDownload().getSimpleRepeats().getHost() + configuration.getDownload().getSimpleRepeats().getFiles().get(prefixId + SIMPLE_REPEATS_FILE_ID); Path outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); logger.info(OK_LOG_MESSAGE); + saveDataSource(TRF_DATA, configuration.getDownload().getSimpleRepeats().getVersion(), getTimeStamp(), - Collections.singletonList(url), repeatsFolder.resolve(getDataVersionFilename(TRF_DATA))); + Collections.singletonList(url), trfFolder.resolve(getDataVersionFilename(TRF_DATA))); } // Download WindowMasker - if (configuration.getDownload().getWindowMasker().getFiles().containsKey(prefixId + WINDOW_MASKER_FILE_ID)) { + if (downloadWm) { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(WM_DATA)); String url = 
configuration.getDownload().getWindowMasker().getHost() + configuration.getDownload().getWindowMasker().getFiles().get(prefixId + WINDOW_MASKER_FILE_ID); Path outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); logger.info(OK_LOG_MESSAGE); + saveDataSource(WM_DATA, configuration.getDownload().getWindowMasker().getVersion(), getTimeStamp(), - Collections.singletonList(url), repeatsFolder.resolve(getDataVersionFilename(WM_DATA))); + Collections.singletonList(url), wmFolder.resolve(getDataVersionFilename(WM_DATA))); } // Download genomic super duplications - if (configuration.getDownload().getGenomicSuperDups().getFiles().containsKey(prefixId + GENOMIC_SUPER_DUPS_FILE_ID)) { + if (downloadGsd) { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GSD_DATA)); String url = configuration.getDownload().getGenomicSuperDups().getHost() + configuration.getDownload().getGenomicSuperDups().getFiles().get(prefixId + GENOMIC_SUPER_DUPS_FILE_ID); Path outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); logger.info(OK_LOG_MESSAGE); + saveDataSource(GSD_DATA, configuration.getDownload().getGenomicSuperDups().getVersion(), getTimeStamp(), - Collections.singletonList(url), repeatsFolder.resolve(getDataVersionFilename(GSD_DATA))); + Collections.singletonList(url), gsdFolder.resolve(getDataVersionFilename(GSD_DATA))); } logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(REPEATS_DATA)); From d0c0ba3cdc1e61eb8281a46b9d410b527002047d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 24 Jul 2024 13:30:55 +0200 Subject: [PATCH 098/148] lib: improve regulation downloader by checking if data is already downloaded, #TASK-5575, #TASK-5576 --- .../download/RegulationDownloadManager.java 
| 81 +++++++++++++++---- 1 file changed, 64 insertions(+), 17 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java index a08fd8c600..ecb2c8e1e6 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java @@ -32,6 +32,10 @@ public class RegulationDownloadManager extends AbstractDownloadManager { private Path regulationFolder; + private Path regulatoryBuildFolder; + private Path motifFeaturesFolder; + private Path mirTarBaseFolder; + private Path mirBaseFolder; public RegulationDownloadManager(String species, String assembly, Path outdir, CellBaseConfiguration configuration) throws IOException, CellBaseException { @@ -44,13 +48,46 @@ public List download() throws IOException, InterruptedException, C // Check if species is supported if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), REGULATION_DATA)) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(REGULATION_DATA)); regulationFolder = downloadFolder.resolve(REGULATION_DATA); Files.createDirectories(regulationFolder); + regulatoryBuildFolder = Files.createDirectories(regulationFolder.resolve(REGULATORY_BUILD_DATA)); + motifFeaturesFolder = Files.createDirectories(regulationFolder.resolve(MOTIF_FEATURES_DATA)); + mirTarBaseFolder = Files.createDirectories(regulationFolder.resolve(MIRTARBASE_DATA)); + mirBaseFolder = Files.createDirectories(regulationFolder.resolve(MIRBASE_DATA)); + + String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); + + // Already downloaded ? 
+ boolean downloadRegulatoryBuild = !isAlreadyDownloaded(regulatoryBuildFolder.resolve(getDataVersionFilename( + REGULATORY_BUILD_DATA)), getDataName(REGULATORY_BUILD_DATA)); + boolean downloadMotifFeatures = !isAlreadyDownloaded(motifFeaturesFolder.resolve(getDataVersionFilename(MOTIF_FEATURES_DATA)), + getDataName(MOTIF_FEATURES_DATA)); + boolean downloadMirTarBase = !isAlreadyDownloaded(mirTarBaseFolder.resolve(getDataVersionFilename(MIRTARBASE_DATA)), + getDataName(MIRTARBASE_DATA)) && configuration.getDownload().getMiRTarBase().getFiles().containsKey(prefixId + + MIRTARBASE_FILE_ID); + boolean downloadMirBase = !isAlreadyDownloaded(mirBaseFolder.resolve(getDataVersionFilename(MIRBASE_DATA)), + getDataName(MIRBASE_DATA)); + + if (!downloadRegulatoryBuild && !downloadMotifFeatures && !downloadMirTarBase && !downloadMirBase) { + return new ArrayList<>(); + } + + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(REGULATION_DATA)); - downloadFiles.addAll(downloadRegulatoryaAndMotifFeatures()); - downloadFiles.add(downloadMiRTarBase()); - downloadFiles.add(downloadMirna()); + if (downloadRegulatoryBuild) { + downloadFiles.addAll(downloadRegulatoryaBuild()); + } + + if (downloadMotifFeatures) { + downloadFiles.addAll(downloadMotifFeatures()); + } + + if (downloadMirTarBase) { + downloadFiles.add(downloadMiRTarBase()); + } + if (downloadMirBase) { + downloadFiles.add(downloadMirna()); + } logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(REGULATION_DATA)); } @@ -59,19 +96,35 @@ public List download() throws IOException, InterruptedException, C } /** - * Downloads Ensembl regulatory build and motif feature files. + * Downloads Ensembl regulatory build. 
* @throws IOException Any issue when writing files * @throws InterruptedException Any issue downloading files */ - private List downloadRegulatoryaAndMotifFeatures() throws IOException, InterruptedException, CellBaseException { + private List downloadRegulatoryaBuild() throws IOException, InterruptedException, CellBaseException { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(REGULATORY_BUILD_DATA)); + DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); // Regulatory build downloadFile = downloadAndSaveEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_REGULATORY_BUILD_FILE_ID, - REGULATORY_BUILD_DATA, regulationFolder); + REGULATORY_BUILD_DATA, regulatoryBuildFolder); downloadFiles.add(downloadFile); + return downloadFiles; + } + + /** + * Downloads Ensembl motif feature files. + * @throws IOException Any issue when writing files + * @throws InterruptedException Any issue downloading files + */ + private List downloadMotifFeatures() throws IOException, InterruptedException, CellBaseException { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(MOTIF_FEATURES_DATA)); + + DownloadFile downloadFile; + List downloadFiles = new ArrayList<>(); + // Motifs features List urls = new ArrayList<>(); downloadFile = downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_MOTIF_FEATURES_FILE_ID, null, @@ -87,7 +140,7 @@ private List downloadRegulatoryaAndMotifFeatures() throws IOExcept // Save data source (name, category, version,...) 
saveDataSource(MOTIF_FEATURES_DATA, "(" + getDataName(ENSEMBL_DATA) + " " + ensemblVersion + ")", getTimeStamp(), urls, - regulationFolder.resolve(getDataVersionFilename(MOTIF_FEATURES_DATA))); + motifFeaturesFolder.resolve(getDataVersionFilename(MOTIF_FEATURES_DATA))); return downloadFiles; } @@ -95,11 +148,7 @@ private List downloadRegulatoryaAndMotifFeatures() throws IOExcept private DownloadFile downloadMirna() throws IOException, InterruptedException, CellBaseException { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(MIRBASE_DATA)); - DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getMirbase(), MIRBASE_FILE_ID, MIRBASE_DATA, - regulationFolder); - - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(MIRBASE_DATA)); - return downloadFile; + return downloadAndSaveDataSource(configuration.getDownload().getMirbase(), MIRBASE_FILE_ID, MIRBASE_DATA, mirBaseFolder); } private DownloadFile downloadMiRTarBase() throws IOException, InterruptedException, CellBaseException { @@ -108,10 +157,8 @@ private DownloadFile downloadMiRTarBase() throws IOException, InterruptedExcepti if (configuration.getDownload().getMiRTarBase().getFiles().containsKey(prefixId + MIRTARBASE_FILE_ID)) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(MIRTARBASE_DATA)); - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getMiRTarBase(), - prefixId + MIRTARBASE_FILE_ID, MIRTARBASE_DATA, regulationFolder); - - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(MIRTARBASE_DATA)); + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getMiRTarBase(), prefixId + MIRTARBASE_FILE_ID, + MIRTARBASE_DATA, mirTarBaseFolder); } return downloadFile; } From 1dc504f9eec2b87a6b6103408a491dca59d3bdb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 24 Jul 2024 16:06:20 +0200 Subject: [PATCH 099/148] lib: fix motif features folder for regulation downloader, #TASK-5575, #TASK-5564 --- 
.../cellbase/lib/download/RegulationDownloadManager.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java index ecb2c8e1e6..b8bd191481 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java @@ -128,13 +128,13 @@ private List downloadMotifFeatures() throws IOException, Interrupt // Motifs features List urls = new ArrayList<>(); downloadFile = downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_MOTIF_FEATURES_FILE_ID, null, - regulationFolder); + motifFeaturesFolder); downloadFiles.add(downloadFile); urls.add(downloadFile.getUrl()); // And now the index file downloadFile = downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_MOTIF_FEATURES_INDEX_FILE_ID, null, - regulationFolder); + motifFeaturesFolder); downloadFiles.add(downloadFile); urls.add(downloadFile.getUrl()); From 4ba788d8fe0ad0646bcb5328a41260a531884f96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 24 Jul 2024 16:10:50 +0200 Subject: [PATCH 100/148] lib: fix minor sonnar issue, #TASK-5575, #TASK-5564 --- .../cellbase/lib/download/RegulationDownloadManager.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java index b8bd191481..36b3aef688 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java @@ -31,7 +31,6 @@ public class 
RegulationDownloadManager extends AbstractDownloadManager { - private Path regulationFolder; private Path regulatoryBuildFolder; private Path motifFeaturesFolder; private Path mirTarBaseFolder; @@ -48,7 +47,7 @@ public List download() throws IOException, InterruptedException, C // Check if species is supported if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), REGULATION_DATA)) { - regulationFolder = downloadFolder.resolve(REGULATION_DATA); + Path regulationFolder = downloadFolder.resolve(REGULATION_DATA); Files.createDirectories(regulationFolder); regulatoryBuildFolder = Files.createDirectories(regulationFolder.resolve(REGULATORY_BUILD_DATA)); motifFeaturesFolder = Files.createDirectories(regulationFolder.resolve(MOTIF_FEATURES_DATA)); From 6fc712964b96c4ba0603f728d2c09e9a72ea2594 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 24 Jul 2024 16:24:06 +0200 Subject: [PATCH 101/148] lib: improve protein downloader by checking if data is already downloaded, #TASK-5575, #TASK-5564 --- .../lib/download/ProteinDownloadManager.java | 43 ++++++++++++++----- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java index d32d3100be..29719208fa 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java @@ -48,26 +48,49 @@ public List download() throws IOException, InterruptedException, C // Check if the species is supported if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), PROTEIN_DATA)) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(PROTEIN_DATA)); Path proteinFolder = downloadFolder.resolve(PROTEIN_DATA); Files.createDirectories(proteinFolder); 
+ Path uniProtFolder = Files.createDirectories(proteinFolder.resolve(UNIPROT_DATA)); + Path interProFolder = Files.createDirectories(proteinFolder.resolve(INTERPRO_DATA)); + Path intactFolder = Files.createDirectories(proteinFolder.resolve(INTACT_DATA)); + + // Already downloaded ? + boolean downloadUniProt = !isAlreadyDownloaded(uniProtFolder.resolve(getDataVersionFilename(UNIPROT_DATA)), + getDataName(UNIPROT_DATA)); + boolean downloadInterPro = !isAlreadyDownloaded(interProFolder.resolve(getDataVersionFilename(INTERPRO_DATA)), + getDataName(INTERPRO_DATA)); + boolean downloadIntact = !isAlreadyDownloaded(intactFolder.resolve(getDataVersionFilename(INTACT_DATA)), + getDataName(INTACT_DATA)); + + if (!downloadUniProt && !downloadInterPro && !downloadIntact) { + return new ArrayList<>(); + } + + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(PROTEIN_DATA)); + DownloadFile downloadFile; // Uniprot - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getUniprot(), - UNIPROT_FILE_ID, UNIPROT_DATA, proteinFolder); - downloadFiles.add(downloadFile); + if (downloadUniProt) { + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getUniprot(), UNIPROT_FILE_ID, UNIPROT_DATA, + uniProtFolder); + downloadFiles.add(downloadFile); + } // InterPro - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getInterpro(), - INTERPRO_FILE_ID, INTERPRO_DATA, proteinFolder); - downloadFiles.add(downloadFile); + if (downloadInterPro) { + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getInterpro(), INTERPRO_FILE_ID, INTERPRO_DATA, + interProFolder); + downloadFiles.add(downloadFile); + } // Intact - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getIntact(), - INTACT_FILE_ID, INTACT_DATA, proteinFolder); - downloadFiles.add(downloadFile); + if (downloadIntact) { + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getIntact(), INTACT_FILE_ID, INTACT_DATA, + intactFolder); + 
downloadFiles.add(downloadFile); + } logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(PROTEIN_DATA)); } From 8ed0e0d326e9d25fd9aebc9b70207a0144cbec2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 24 Jul 2024 16:38:35 +0200 Subject: [PATCH 102/148] lib: improve variation downloader by checking if data is already downloaded, #TASK-5575, #TASK-5564 --- .../download/VariationDownloadManager.java | 52 +++++++++++-------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java index 4efcc0e0d7..62e9eb8a34 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java @@ -24,7 +24,6 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; -import java.util.Collections; import java.util.List; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -45,34 +44,41 @@ public List downloadVariation() throws IOException, InterruptedExc List downloadFiles = new ArrayList<>(); // Check if species is supported - if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), VARIATION_DATA)) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(VARIATION_DATA)); - + // and we do not need to download human variation data from Ensembl. It is already included in the CellBase. + if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), VARIATION_DATA) + && !speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { Path variationFolder = downloadFolder.resolve(VARIATION_DATA); Files.createDirectories(variationFolder); - // We do not need to download human variation data from Ensembl. It is already included in the CellBase. 
- if (!speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, speciesShortName + ".vcf.gz"); - String fileName = variationFolder.resolve(speciesShortName + ".gtf.gz").toString(); - String url = ensemblHostUrl + "/" + ensemblRelease + "/variation/vcf/" + speciesShortName + "/" - + speciesShortName + ".vcf.gz"; - downloadFiles.add(downloadFile(url, fileName)); - logger.info(OK_LOG_MESSAGE); - saveDataSource(VARIATION_DATA, ensemblVersion, getTimeStamp(), Collections.singletonList(url), - variationFolder.resolve(getDataVersionFilename(VARIATION_DATA))); - - fileName = variationFolder.resolve(speciesShortName + "_structural_variations.gtf.gz").toString(); - url = ensemblHostUrl + "/" + ensemblRelease + "/variation/vcf/" + speciesShortName + "/" - + speciesShortName + "_structural_variations.vcf.gz"; - downloadFiles.add(downloadFile(url, fileName)); - logger.info(OK_LOG_MESSAGE); - saveDataSource(VARIATION_DATA, ensemblVersion, getTimeStamp(), Collections.singletonList(url), - variationFolder.resolve(getDataVersionFilename(VARIATION_DATA))); + if (isAlreadyDownloaded(downloadFolder.resolve(getDataVersionFilename(VARIATION_DATA)), getDataName(VARIATION_DATA))) { + return new ArrayList<>(); } + + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(VARIATION_DATA)); + + List urls = new ArrayList<>(); + + String fileName = variationFolder.resolve(speciesShortName + ".gtf.gz").toString(); + String url = ensemblHostUrl + "/" + ensemblRelease + "/variation/vcf/" + speciesShortName + "/" + + speciesShortName + ".vcf.gz"; + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, fileName); + downloadFiles.add(downloadFile(url, fileName)); + urls.add(url); + logger.info(OK_LOG_MESSAGE); + + fileName = variationFolder.resolve(speciesShortName + "_structural_variations.gtf.gz").toString(); + url = ensemblHostUrl + "/" + ensemblRelease + "/variation/vcf/" + speciesShortName + "/" + + speciesShortName + 
"_structural_variations.vcf.gz"; + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, fileName); + downloadFiles.add(downloadFile(url, fileName)); + urls.add(url); + logger.info(OK_LOG_MESSAGE); + + saveDataSource(VARIATION_DATA, ensemblVersion, getTimeStamp(), urls, variationFolder.resolve( + getDataVersionFilename(VARIATION_DATA))); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(VARIATION_DATA)); } - return downloadFiles; } } From 144276672487b8f5a0bf14d58ba88956e6d5c083 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 24 Jul 2024 18:01:18 +0200 Subject: [PATCH 103/148] lib: fix variation folder in downloader, #TASK-5575, #TASK-5564 --- .../opencb/cellbase/lib/download/VariationDownloadManager.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java index 62e9eb8a34..376369eedd 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java @@ -50,7 +50,7 @@ public List downloadVariation() throws IOException, InterruptedExc Path variationFolder = downloadFolder.resolve(VARIATION_DATA); Files.createDirectories(variationFolder); - if (isAlreadyDownloaded(downloadFolder.resolve(getDataVersionFilename(VARIATION_DATA)), getDataName(VARIATION_DATA))) { + if (isAlreadyDownloaded(variationFolder.resolve(getDataVersionFilename(VARIATION_DATA)), getDataName(VARIATION_DATA))) { return new ArrayList<>(); } From e48d27d95fdc1f7c49da040b72fe9171853ca104 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 24 Jul 2024 18:34:41 +0200 Subject: [PATCH 104/148] core: remove DISGENET, #TASK-5575, #TASK-5564 --- .../core/config/DownloadProperties.java | 20 
------------------- .../src/main/resources/configuration.yml | 5 ----- 2 files changed, 25 deletions(-) diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java index 7d9adfac3c..84282a277c 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java @@ -51,8 +51,6 @@ public class DownloadProperties { private URLProperties windowMasker; private URLProperties genomicSuperDups; private URLProperties hpo; - private URLProperties disgenet; - private URLProperties disgenetReadme; private URLProperties dgidb; private URLProperties cancerGeneCensus; private URLProperties gwasCatalog; @@ -283,24 +281,6 @@ public DownloadProperties setHpo(URLProperties hpo) { return this; } - public URLProperties getDisgenet() { - return disgenet; - } - - public DownloadProperties setDisgenet(URLProperties disgenet) { - this.disgenet = disgenet; - return this; - } - - public URLProperties getDisgenetReadme() { - return disgenetReadme; - } - - public DownloadProperties setDisgenetReadme(URLProperties disgenetReadme) { - this.disgenetReadme = disgenetReadme; - return this; - } - public URLProperties getDgidb() { return dgidb; } diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index ca0ac4c099..982c46846c 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -135,11 +135,6 @@ download: version: "2024-04-26" files: HPO: "manual@phenotype_to_genes.txt" - disgenet: - host: https://www.disgenet.org/ - version: "7.0 (January 2020)" - files: - DISGENET: static/disgenet_ap1/files/downloads/all_gene_disease_associations.tsv.gz gnomadConstraints: host: https://storage.googleapis.com/ version: "2.1.1" From 
642935a7f6d0e9cfaf3bf4fed1c419eacb186b20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 24 Jul 2024 19:04:42 +0200 Subject: [PATCH 105/148] lib: improve gene downloader, removing DISGENET, fixing sonnar issues, #TASK-5575, #TASK-5564 --- .../ensembl-scripts/gene_extra_info.pl | 2 +- .../lib/download/GeneDownloadManager.java | 119 +++++++++++------- 2 files changed, 78 insertions(+), 43 deletions(-) diff --git a/cellbase-app/app/scripts/ensembl-scripts/gene_extra_info.pl b/cellbase-app/app/scripts/ensembl-scripts/gene_extra_info.pl index d227b4c1c5..22b6a825b2 100755 --- a/cellbase-app/app/scripts/ensembl-scripts/gene_extra_info.pl +++ b/cellbase-app/app/scripts/ensembl-scripts/gene_extra_info.pl @@ -18,7 +18,7 @@ #################################################################### ##docker run -it --mount type=bind,source=/tmp,target=/tmp opencb/cellbase-builder:6.2.0-SNAPSHOT /opt/cellbase/scripts/ensembl-scripts/gene_extra_info.pl -s "Mus musculus" -o /tmp -# USAGE: ./gene_extra_info.pl --species "Homo sapiens" --outdir ../../appl_db/ird_v1/hsa ... +# USAGE: ./gene_extra_info.pl --species "Homo sapiens" --assembly "GRCh38" --outdir ../../appl_db/ird_v1/hsa ... 
## Parsing command line GetOptions ('species=s' => \$species, 'assembly=s' => \$assembly, 'outdir=s' => \$outdir, 'phylo=s' => \$phylo, diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index 356c637ca7..faa3e3b2f2 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -26,7 +26,10 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.*; +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -75,7 +78,6 @@ public List download() throws IOException, InterruptedException, C downloadFiles.add(downloadDrugData(geneDownloadPath)); downloadFiles.add(downloadGeneUniprotXref(geneDownloadPath)); downloadFiles.add(downloadGeneExpressionAtlas(geneDownloadPath)); - downloadFiles.add(downloadGeneDiseaseAnnotation(geneDownloadPath)); downloadFiles.add(downloadGnomadConstraints(geneDownloadPath)); downloadFiles.add(downloadGO(geneDownloadPath)); logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_ANNOTATION_DATA)); @@ -83,18 +85,26 @@ public List download() throws IOException, InterruptedException, C // Save data sources manually downloaded if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { // HPO - saveDataSource(HPO_DISEASE_DATA, configuration.getDownload().getHpo().getVersion(), getTimeStamp(), - Collections.singletonList(getManualUrl(configuration.getDownload().getHpo(), HPO_FILE_ID)), - geneDownloadPath.resolve(getDataVersionFilename(HPO_DISEASE_DATA))); - logger.warn("{} must be downloaded manually; the version file {} was created at {}", getDataName(HPO_DISEASE_DATA), - getDataVersionFilename(HPO_DISEASE_DATA), 
geneDownloadPath); + if (Files.exists(geneDownloadPath.resolve(getDataVersionFilename(HPO_DISEASE_DATA)))) { + logger.warn("The version file {} already exists", getDataVersionFilename(HPO_DISEASE_DATA)); + } else { + saveDataSource(HPO_DISEASE_DATA, configuration.getDownload().getHpo().getVersion(), getTimeStamp(), + Collections.singletonList(getManualUrl(configuration.getDownload().getHpo(), HPO_FILE_ID)), + geneDownloadPath.resolve(getDataVersionFilename(HPO_DISEASE_DATA))); + logger.warn("{} must be downloaded manually; the version file {} was created at {}", getDataName(HPO_DISEASE_DATA), + getDataVersionFilename(HPO_DISEASE_DATA), geneDownloadPath); + } // Cancer gene census - saveDataSource(CANCER_GENE_CENSUS_DATA, configuration.getDownload().getCancerGeneCensus().getVersion(), getTimeStamp(), - Collections.singletonList(getManualUrl(configuration.getDownload().getCancerGeneCensus(), CANCER_GENE_CENSUS_FILE_ID)), - geneDownloadPath.resolve(getDataVersionFilename(CANCER_GENE_CENSUS_DATA))); - logger.warn("{} must be downloaded manually; the version file {} was created at {}", getDataName(CANCER_GENE_CENSUS_DATA), - getDataVersionFilename(CANCER_GENE_CENSUS_DATA), geneDownloadPath); + if (Files.exists(geneDownloadPath.resolve(getDataVersionFilename(CANCER_GENE_CENSUS_DATA)))) { + logger.warn("The version file {} already exists", getDataVersionFilename(CANCER_GENE_CENSUS_DATA)); + } else { + saveDataSource(CANCER_GENE_CENSUS_DATA, configuration.getDownload().getCancerGeneCensus().getVersion(), getTimeStamp(), + Collections.singletonList(getManualUrl(configuration.getDownload().getCancerGeneCensus(), + CANCER_GENE_CENSUS_FILE_ID)), geneDownloadPath.resolve(getDataVersionFilename(CANCER_GENE_CENSUS_DATA))); + logger.warn("{} must be downloaded manually; the version file {} was created at {}", getDataName(CANCER_GENE_CENSUS_DATA), + getDataVersionFilename(CANCER_GENE_CENSUS_DATA), geneDownloadPath); + } } logger.info(DOWNLOADING_DONE_LOG_MESSAGE, 
getDataName(GENE_DATA)); @@ -106,6 +116,11 @@ private List downloadEnsemblData(Path ensemblDownloadPath) throws // Check if the species is supported if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), GENE_DATA)) { + // Already downloaded ? + if (isAlreadyDownloaded(ensemblDownloadPath.resolve(getDataVersionFilename(ENSEMBL_DATA)), getDataName(ENSEMBL_DATA))) { + return downloadFiles; + } + logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataName(ENSEMBL_DATA), getDataCategory(ENSEMBL_DATA)); DownloadProperties.EnsemblProperties ensemblConfig = configuration.getDownload().getEnsembl(); @@ -135,7 +150,8 @@ private List downloadRefSeq(Path refSeqDownloadPath) throws IOExce if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), GENE_DATA)) { // GTF, DNA, RNA, Protein String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); - if (configuration.getDownload().getGenomicSuperDups().getFiles().containsKey(prefixId + REFSEQ_GENOMIC_GTF_FILE_ID)) { + if (configuration.getDownload().getGenomicSuperDups().getFiles().containsKey(prefixId + REFSEQ_GENOMIC_GTF_FILE_ID) + && !isAlreadyDownloaded(refSeqDownloadPath.resolve(getDataVersionFilename(REFSEQ_DATA)), getDataName(REFSEQ_DATA))) { logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataName(REFSEQ_DATA), getDataCategory(REFSEQ_DATA)); DownloadProperties.URLProperties refSeqConfig = configuration.getDownload().getRefSeq(); @@ -155,6 +171,14 @@ private List downloadRefSeq(Path refSeqDownloadPath) throws IOExce } public void downloadEnsemblCanonical(Path geneDownloadPath) throws IOException, CellBaseException { + String ensemblCanonicalScript = "ensembl_canonical.pl"; + String ensemblCanonicalFilename = "ensembl_canonical.txt"; + + if (Files.exists(geneDownloadPath.resolve(ensemblCanonicalFilename))) { + logger.warn("File {} already exists, skipping running the Perl script {}", ensemblCanonicalFilename, ensemblCanonicalScript); + return; + 
} + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(ENSEMBL_CANONICAL_DATA)); String dockerImage = "opencb/cellbase-builder:" + GitRepositoryState.get().getBuildVersion(); @@ -165,7 +189,7 @@ public void downloadEnsemblCanonical(Path geneDownloadPath) throws IOException, geneDownloadPath.toAbsolutePath().toString(), "/tmp"); // Params - String params = "/opt/cellbase/scripts/ensembl-scripts/ensembl_canonical.pl" + String params = "/opt/cellbase/scripts/ensembl-scripts/" + ensemblCanonicalScript + " --species \"" + speciesConfiguration.getId() + "\"" + " --assembly \"" + assemblyConfiguration.getName() + "\"" + " --outdir \"" + outputBinding.getValue() + "\""; @@ -173,13 +197,24 @@ public void downloadEnsemblCanonical(Path geneDownloadPath) throws IOException, // Execute perl script in docker DockerUtils.run(dockerImage, null, outputBinding, params, null); } catch (Exception e) { - throw new CellBaseException("Error executing Perl script from Docker " + dockerImage, e); + logger.error("{}", e.getStackTrace()); +// throw new CellBaseException("Error executing Perl script from Docker " + dockerImage, e); } logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(ENSEMBL_CANONICAL_DATA)); } public void downloadGeneExtraInfo(Path geneDownloadPath) throws IOException, CellBaseException { + String geneExtraInfoScript = "gene_extra_info.pl"; + String descriptionFilename = "description.txt"; + String xrefsFilename = "xrefs.txt"; + + if (Files.exists(geneDownloadPath.resolve(descriptionFilename)) && Files.exists(geneDownloadPath.resolve(xrefsFilename))) { + logger.warn("Files {} and {} already exist, skipping running the Perl script {}", descriptionFilename, xrefsFilename, + geneExtraInfoScript); + return; + } + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_EXTRA_INFO_DATA)); String dockerImage = "opencb/cellbase-builder:" + GitRepositoryState.get().getBuildVersion(); @@ -190,14 +225,16 @@ public void downloadGeneExtraInfo(Path geneDownloadPath) throws IOException, Cel 
geneDownloadPath.toAbsolutePath().toString(), "/tmp"); // Params - String params = "/opt/cellbase/scripts/ensembl-scripts/gene_extra_info.pl" - + " --species \"" + speciesConfiguration.getId() + "\"" + String params = "/opt/cellbase/scripts/ensembl-scripts/" + geneExtraInfoScript + + " --species \"" + speciesConfiguration.getScientificName() + "\"" + + " --assembly \"" + assemblyConfiguration.getName() + "\"" + " --outdir \"" + outputBinding.getValue() + "\""; // Execute perl script in docker DockerUtils.run(dockerImage, null, outputBinding, params, null); } catch (Exception e) { - throw new CellBaseException("Error executing Perl script from Docker " + dockerImage, e); + logger.error("{}", e.getStackTrace()); +// throw new CellBaseException("Error executing Perl script from Docker " + dockerImage, e); } logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_EXTRA_INFO_DATA)); @@ -207,7 +244,9 @@ private DownloadFile downloadMane(Path geneDownloadPath) throws IOException, Int DownloadFile downloadFile = null; // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME) + && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(MANE_SELECT_DATA)), + getDataName(MANE_SELECT_DATA))) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(MANE_SELECT_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getManeSelect(), MANE_SELECT_FILE_ID, @@ -236,7 +275,8 @@ private DownloadFile downloadHgnc(Path geneDownloadPath) throws IOException, Int DownloadFile downloadFile = null; // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME) + && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(HGNC_DATA)), getDataName(HGNC_DATA))) { logger.info(DOWNLOADING_LOG_MESSAGE, 
getDataName(HGNC_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getHgnc(), HGNC_FILE_ID, HGNC_DATA, geneDownloadPath); @@ -250,7 +290,9 @@ private DownloadFile downloadCancerHotspot(Path geneDownloadPath) throws IOExcep DownloadFile downloadFile = null; // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME) + && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(CANCER_HOTSPOT_DATA)), + getDataName(CANCER_HOTSPOT_DATA))) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CANCER_HOTSPOT_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getCancerHotspot(), CANCER_HOTSPOT_FILE_ID, @@ -265,7 +307,8 @@ private DownloadFile downloadDrugData(Path geneDownloadPath) throws IOException, DownloadFile downloadFile = null; // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME) + && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(DGIDB_DATA)), getDataName(DGIDB_DATA))) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(DGIDB_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDgidb(), DGIDB_FILE_ID, DGIDB_DATA, geneDownloadPath); @@ -280,7 +323,9 @@ private DownloadFile downloadGeneUniprotXref(Path geneDownloadPath) throws IOExc // Check if the species is supported String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); - if (configuration.getDownload().getGeneUniprotXref().getFiles().containsKey(prefixId + UNIPROT_XREF_FILE_ID)) { + if (configuration.getDownload().getGeneUniprotXref().getFiles().containsKey(prefixId + UNIPROT_XREF_FILE_ID) + && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(UNIPROT_XREF_DATA)), + 
getDataName(UNIPROT_XREF_DATA))) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(UNIPROT_XREF_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGeneUniprotXref(), @@ -295,7 +340,9 @@ private DownloadFile downloadGeneExpressionAtlas(Path geneDownloadPath) throws I DownloadFile downloadFile = null; // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME) + && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(GENE_EXPRESSION_ATLAS_DATA)), + getDataName(GENE_EXPRESSION_ATLAS_DATA))) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_EXPRESSION_ATLAS_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGeneExpressionAtlas(), @@ -306,27 +353,13 @@ private DownloadFile downloadGeneExpressionAtlas(Path geneDownloadPath) throws I return downloadFile; } - private DownloadFile downloadGeneDiseaseAnnotation(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { - DownloadFile downloadFile = null; - - // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_DISEASE_ANNOTATION_DATA)); - - // DisGeNet - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDisgenet(), - DISGENET_FILE_ID, DISGENET_DATA, geneDownloadPath); - - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_DISEASE_ANNOTATION_DATA)); - } - return downloadFile; - } - private DownloadFile downloadGnomadConstraints(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { DownloadFile downloadFile = null; // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME) + && 
!isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(GNOMAD_CONSTRAINTS_DATA)), + getDataName(GNOMAD_CONSTRAINTS_DATA))) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GNOMAD_CONSTRAINTS_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGnomadConstraints(), @@ -342,7 +375,9 @@ private DownloadFile downloadGO(Path geneDownloadPath) throws IOException, Inter // Check if the species is supported String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); - if (configuration.getDownload().getGoAnnotation().getFiles().containsKey(prefixId + GO_ANNOTATION_FILE_ID)) { + if (configuration.getDownload().getGoAnnotation().getFiles().containsKey(prefixId + GO_ANNOTATION_FILE_ID) + && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(GO_ANNOTATION_DATA)), + getDataName(GO_ANNOTATION_DATA))) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GO_ANNOTATION_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGoAnnotation(), From 8030b02b70422a9b7d273abb550740dd8b7e0c05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 25 Jul 2024 00:01:36 +0200 Subject: [PATCH 106/148] lib: fix command line to execute Perl script, #TASK-5575, #TASK-5564 --- .../org/opencb/cellbase/lib/download/GeneDownloadManager.java | 1 - 1 file changed, 1 deletion(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index faa3e3b2f2..4d225d5eb9 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -191,7 +191,6 @@ public void downloadEnsemblCanonical(Path geneDownloadPath) throws IOException, // Params String params = "/opt/cellbase/scripts/ensembl-scripts/" + 
ensemblCanonicalScript + " --species \"" + speciesConfiguration.getId() + "\"" - + " --assembly \"" + assemblyConfiguration.getName() + "\"" + " --outdir \"" + outputBinding.getValue() + "\""; // Execute perl script in docker From e17e51d3d26cc0a7e15ffc008d7074d2d641869d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 25 Jul 2024 09:17:01 +0200 Subject: [PATCH 107/148] lib: add files generated by scripts in the version JSON files, #TASK-5575, #TASK-5564 --- .../src/main/resources/configuration.yml | 14 ++--- .../org/opencb/cellbase/lib/EtlCommons.java | 15 +----- .../lib/download/AbstractDownloadManager.java | 5 +- .../lib/download/GeneDownloadManager.java | 52 ++++++++++--------- .../lib/download/GenomeDownloadManager.java | 9 +++- 5 files changed, 41 insertions(+), 54 deletions(-) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 982c46846c..29bf940175 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -49,16 +49,10 @@ download: REGULATORY_BUILD: "release-put_release_here/regulation/put_species_here/put_species_here.put_assembly_here.Regulatory_Build.regulatory_features.20221007.gff.gz" MOTIF_FEATURES: "release-put_release_here/regulation/put_species_here/MotifFeatures/put_species_here.put_assembly_here.motif_features.gff.gz" MOTIF_FEATURES_INDEX: "release-put_release_here/regulation/put_species_here/MotifFeatures/put_species_here.put_assembly_here.motif_features.gff.gz.tbi" - # To be generated manually - DESCRIPTION: "manual@description.txt" - # To be generated manually - XREFS: "manual@xrefs.txt" - # To be downloaded manually -# HAEM_ONC_TRANSCRIPTS: "manual@EGLH_HaemOnc_transcripts.txt" -# # To be downloaded manually -# TSO500: "manual@TSO500_transcripts.txt" - # To be downloaded manually -# CANONICAL: "manual@ensembl_canonical.txt" + DESCRIPTION: 
"script:gene_extra_info.pl@description.txt" + XREFS: "script:gene_extra_info.pl@xrefs.txt" + CANONICAL: "script:ensembl_canonical.pl@ensembl_canonical.txt" + GENOME_INFO: "script:genome_info.pl@genome_info.json" ensemblGenomes: database: diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index dcec1f6de5..0c95580205 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -58,6 +58,7 @@ public final class EtlCommons { public static final String HG19_NAME = "hg19"; public static final String MANUAL_PREFIX = "manual@"; + public static final String SCRIPT_PREFIX = "script:"; public static final String SUFFIX_VERSION_FILENAME = "Version.json"; @@ -86,9 +87,8 @@ public final class EtlCommons { public static final String ENSEMBL_MOTIF_FEATURES_INDEX_FILE_ID = "MOTIF_FEATURES_INDEX"; public static final String ENSEMBL_DESCRIPTION_FILE_ID = "DESCRIPTION"; public static final String ENSEMBL_XREFS_FILE_ID = "XREFS"; - public static final String ENSEMBL_HAEM_ONC_TRANSCRIPTS_FILE_ID = "HAEM_ONC_TRANSCRIPTS"; - public static final String ENSEMBL_TSO500_FILE_ID = "TSO500"; public static final String ENSEMBL_CANONICAL_FILE_ID = "CANONICAL"; + public static final String GENOME_INFO_FILE_ID = "GENOME_INFO"; // Genome public static final String GENOME_DATA = "genome"; @@ -704,17 +704,6 @@ public static List getUrls(List downloadFiles) { return downloadFiles.stream().map(DownloadFile::getUrl).collect(Collectors.toList()); } - public static List getManualUrls(DownloadProperties.URLProperties props) { - List urls = new ArrayList<>(); - for (String value : props.getFiles().values()) { - String url = getManualUrl(props.getHost(), value); - if (StringUtils.isNotEmpty(url)) { - urls.add(url); - } - } - return urls; - } - public static String getManualUrl(DownloadProperties.URLProperties props, 
String fileId) { return getManualUrl(props.getHost(), props.getFiles().get(fileId)); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index 975e182cb7..3f852ec8ad 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -235,8 +235,7 @@ protected void saveDataSource(String data, String version, String date, List downloadEnsemblData(Path ensemblDownloadPath) throws // Save data source (i.e., metadata) List urls = getUrls(downloadFiles); - // Add manually downloaded files - urls.addAll(getManualUrls(ensemblConfig.getUrl())); + // Add files created by scripts + urls.add(getManualUrl(configuration.getDownload().getEnsembl().getUrl(), ENSEMBL_DESCRIPTION_FILE_ID)); + urls.add(getManualUrl(configuration.getDownload().getEnsembl().getUrl(), ENSEMBL_XREFS_FILE_ID)); + urls.add(getManualUrl(configuration.getDownload().getEnsembl().getUrl(), ENSEMBL_CANONICAL_FILE_ID)); saveDataSource(ENSEMBL_DATA, ensemblVersion, getTimeStamp(), urls, ensemblDownloadPath.resolve(getDataVersionFilename(ENSEMBL_DATA))); @@ -182,22 +184,22 @@ public void downloadEnsemblCanonical(Path geneDownloadPath) throws IOException, logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(ENSEMBL_CANONICAL_DATA)); String dockerImage = "opencb/cellbase-builder:" + GitRepositoryState.get().getBuildVersion(); - try { - // Build command line to run Perl script via docker image - // Output binding - AbstractMap.SimpleEntry outputBinding = new AbstractMap.SimpleEntry<>( - geneDownloadPath.toAbsolutePath().toString(), "/tmp"); - // Params - String params = "/opt/cellbase/scripts/ensembl-scripts/" + ensemblCanonicalScript - + " --species \"" + speciesConfiguration.getId() + "\"" - + " --outdir \"" + outputBinding.getValue() + "\""; + // 
Build command line to run Perl script via docker image + // Output binding + AbstractMap.SimpleEntry outputBinding = new AbstractMap.SimpleEntry<>( + geneDownloadPath.toAbsolutePath().toString(), "/tmp"); + + // Params + String params = "/opt/cellbase/scripts/ensembl-scripts/" + ensemblCanonicalScript + + " --species \"" + speciesConfiguration.getId() + "\"" + + " --outdir \"" + outputBinding.getValue() + "\""; + try { // Execute perl script in docker DockerUtils.run(dockerImage, null, outputBinding, params, null); } catch (Exception e) { - logger.error("{}", e.getStackTrace()); -// throw new CellBaseException("Error executing Perl script from Docker " + dockerImage, e); + logger.error("Error executing script {}: {}", params, e.getStackTrace()); } logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(ENSEMBL_CANONICAL_DATA)); @@ -217,23 +219,23 @@ public void downloadGeneExtraInfo(Path geneDownloadPath) throws IOException, Cel logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_EXTRA_INFO_DATA)); String dockerImage = "opencb/cellbase-builder:" + GitRepositoryState.get().getBuildVersion(); - try { - // Build command line to run Perl script via docker image - // Output binding - AbstractMap.SimpleEntry outputBinding = new AbstractMap.SimpleEntry<>( - geneDownloadPath.toAbsolutePath().toString(), "/tmp"); - // Params - String params = "/opt/cellbase/scripts/ensembl-scripts/" + geneExtraInfoScript - + " --species \"" + speciesConfiguration.getScientificName() + "\"" - + " --assembly \"" + assemblyConfiguration.getName() + "\"" - + " --outdir \"" + outputBinding.getValue() + "\""; + // Build command line to run Perl script via docker image + // Output binding + AbstractMap.SimpleEntry outputBinding = new AbstractMap.SimpleEntry<>( + geneDownloadPath.toAbsolutePath().toString(), "/tmp"); + + // Params + String params = "/opt/cellbase/scripts/ensembl-scripts/" + geneExtraInfoScript + + " --species \"" + speciesConfiguration.getScientificName() + "\"" + + " --assembly 
\"" + assemblyConfiguration.getName() + "\"" + + " --outdir \"" + outputBinding.getValue() + "\""; + try { // Execute perl script in docker DockerUtils.run(dockerImage, null, outputBinding, params, null); } catch (Exception e) { - logger.error("{}", e.getStackTrace()); -// throw new CellBaseException("Error executing Perl script from Docker " + dockerImage, e); + logger.error("Error executing script {}: {}", params, e.getStackTrace()); } logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_EXTRA_INFO_DATA)); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index e5ad0d8824..af1cc60123 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -59,14 +59,19 @@ public List downloadReferenceGenome() throws IOException, Interrup logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENOME_DATA)); Files.createDirectories(sequenceFolder); + List urls = new ArrayList<>(); + // Reference genome sequences are downloaded from Ensembl // New Homo sapiens assemblies contain too many ALT regions, so we download 'primary_assembly' file instead DownloadFile downloadFile = downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_PRIMARY_FA_FILE_ID, sequenceFolder); + urls.add(downloadFile.getUrl()); + + // Add files generated by scripts + urls.add(configuration.getDownload().getEnsembl().getUrl().getFiles().get(GENOME_INFO_FILE_ID)); // Save data source - saveDataSource(GENOME_DATA, ensemblVersion, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - genomeVersionFilePath); + saveDataSource(GENOME_DATA, ensemblVersion, getTimeStamp(), urls, genomeVersionFilePath); logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENOME_DATA)); From 
733cade10bc10412b93093231a0ceeac6776d6d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 25 Jul 2024 13:00:14 +0200 Subject: [PATCH 108/148] lib: improve genome builder by checking files, and fixing sonnar issues, #TASK-5576, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 76 +++++++++++++------ .../lib/builders/AbstractBuilder.java | 2 +- .../builders/GenomeSequenceFastaBuilder.java | 34 +++++---- .../lib/download/GenomeDownloadManager.java | 8 +- 4 files changed, 77 insertions(+), 43 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 05d5de191b..ef76205742 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -44,6 +44,10 @@ import java.util.List; import static org.opencb.cellbase.lib.EtlCommons.*; +import static org.opencb.cellbase.lib.builders.AbstractBuilder.BUILDING_DONE_LOG_MESSAGE; +import static org.opencb.cellbase.lib.builders.AbstractBuilder.BUILDING_LOG_MESSAGE; +import static org.opencb.cellbase.lib.builders.GenomeSequenceFastaBuilder.GENOME_OUTPUT_FILENAME; +import static org.opencb.cellbase.lib.download.GenomeDownloadManager.GENOME_INFO_FILENAME; public class BuildCommandExecutor extends CommandExecutor { @@ -61,10 +65,6 @@ public class BuildCommandExecutor extends CommandExecutor { private boolean flexibleGTFParsing; - private static final List VALID_SOURCES_TO_BUILD = Arrays.asList(GENOME_DATA, GENE_DATA, VARIATION_FUNCTIONAL_SCORE_DATA, - MISSENSE_VARIATION_SCORE_DATA, REGULATION_DATA, PROTEIN_DATA, CONSERVATION_DATA, CLINICAL_VARIANT_DATA, REPEATS_DATA, - ONTOLOGY_DATA, SPLICE_SCORE_DATA, PUBMED_DATA, PHARMACOGENOMICS_DATA); - public 
BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildCommandOptions) { super(buildCommandOptions.commonOptions.logLevel, buildCommandOptions.commonOptions.conf); @@ -115,10 +115,8 @@ public void execute() throws CellBaseException { throw new CellBaseException("Download folder not found '" + spShortName + "_" + spAssembly + "/download'"); } buildFolder = outputDirectory.resolve(spFolder + "/generated_json"); - if (!buildFolder.toFile().exists()) { - if (!Files.exists(buildFolder)) { - Files.createDirectories(buildFolder); - } + if (!Files.exists(buildFolder)) { + Files.createDirectories(buildFolder); } // Check data sources @@ -170,9 +168,11 @@ public void execute() throws CellBaseException { + "Valid values are: " + StringUtils.join(speciesConfiguration.getData(), ",") + ". You can use data parameter 'all' to download everything"); } - - parser.parse(); - parser.disconnect(); + if (parser != null) { + parser.parse(); + parser.disconnect(); + logger.info(BUILDING_DONE_LOG_MESSAGE); + } } } catch (InterruptedException e) { // Restore interrupted state... 
@@ -184,16 +184,47 @@ public void execute() throws CellBaseException { } private AbstractBuilder buildGenomeSequence() throws CellBaseException { + logger.info(BUILDING_LOG_MESSAGE, getDataName(GENOME_DATA)); + + Path genomeDownloadFolder = downloadFolder.resolve(GENOME_DATA); + Path genomeBuildFolder = buildFolder.resolve(GENOME_DATA); + + if (Files.exists(genomeBuildFolder.resolve(GENOME_OUTPUT_FILENAME)) + && Files.exists(genomeBuildFolder.resolve(GENOME_INFO_FILENAME)) + && Files.exists(genomeBuildFolder.resolve(getDataVersionFilename(GENOME_DATA)))) { + logger.warn("{} data has been already built", getDataName(GENOME_DATA)); + return null; + } + // Sanity check - Path genomeVersionPath = downloadFolder.resolve(GENOME_DATA).resolve(getDataVersionFilename(GENOME_DATA)); - copyVersionFiles(Collections.singletonList(genomeVersionPath), buildFolder.resolve(GENOME_DATA)); + if (!Files.exists(genomeDownloadFolder.resolve(GENOME_INFO_FILENAME))) { + throw new CellBaseException("Genome info file " + GENOME_INFO_FILENAME + " does not exist at " + genomeDownloadFolder); + } - // Get FASTA path - Path fastaPath = getFastaReferenceGenome(); + // Copy files if necessary + if (!Files.exists(genomeBuildFolder.resolve(getDataVersionFilename(GENOME_DATA)))) { + Path genomeVersionPath = genomeDownloadFolder.resolve(getDataVersionFilename(GENOME_DATA)); + copyVersionFiles(Collections.singletonList(genomeVersionPath), buildFolder.resolve(GENOME_DATA)); + } + + if (!Files.exists(genomeBuildFolder.resolve(GENOME_INFO_FILENAME))) { + try { + Files.copy(genomeDownloadFolder.resolve(GENOME_INFO_FILENAME), genomeBuildFolder.resolve(GENOME_INFO_FILENAME)); + } catch (IOException e) { + throw new CellBaseException("Error copying file " + GENOME_INFO_FILENAME, e); + } + } - // Create serializer and return the genome builder - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(GENOME_DATA), GENOME_DATA); - return new GenomeSequenceFastaBuilder(fastaPath, 
serializer); + // Parse file + if (!Files.exists(genomeBuildFolder.resolve(GENOME_OUTPUT_FILENAME))) { + // Get FASTA path + Path fastaPath = getFastaReferenceGenome(); + + // Create serializer and return the genome builder + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(genomeBuildFolder, GENOME_DATA); + return new GenomeSequenceFastaBuilder(fastaPath, serializer); + } + return null; } private AbstractBuilder buildGene() throws CellBaseException { @@ -279,8 +310,8 @@ private AbstractBuilder buildConservation() throws CellBaseException { Path conservationDownloadPath = downloadFolder.resolve(CONSERVATION_DATA); Path conservationBuildPath = buildFolder.resolve(CONSERVATION_DATA); copyVersionFiles(Arrays.asList(conservationDownloadPath.resolve(getDataVersionFilename(GERP_DATA)), - conservationDownloadPath.resolve(getDataVersionFilename(PHASTCONS_DATA)), - conservationDownloadPath.resolve(getDataVersionFilename(PHYLOP_DATA))), conservationBuildPath); + conservationDownloadPath.resolve(getDataVersionFilename(PHASTCONS_DATA)), + conservationDownloadPath.resolve(getDataVersionFilename(PHYLOP_DATA))), conservationBuildPath); int conservationChunkSize = MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(conservationBuildPath); @@ -324,7 +355,8 @@ private Path getFastaReferenceGenome() throws CellBaseException { // Gunzip logger.info("Gunzip file: {}", fastaPath); try { - EtlCommons.runCommandLineProcess(null, "gunzip", Collections.singletonList(fastaPath.toString()), null); + List params = Arrays.asList("--keep", fastaPath.toString()); + EtlCommons.runCommandLineProcess(null, "gunzip", params, null); } catch (IOException e) { throw new CellBaseException("Error executing gunzip in FASTA file " + fastaPath, e); } catch (InterruptedException e) { @@ -387,7 +419,7 @@ private void checkVersionFiles(List versionPaths) throws CellBaseException } try { DataSource dataSource = 
dataSourceReader.readValue(versionPath.toFile()); - if (org.apache.commons.lang3.StringUtils.isEmpty(dataSource.getVersion())) { + if (StringUtils.isEmpty(dataSource.getVersion())) { throw new CellBaseException("Version missing version in file " + versionPath + ": a version must be specified in the" + " file"); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AbstractBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AbstractBuilder.java index 85a04e2f8f..155b41cc5e 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AbstractBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AbstractBuilder.java @@ -52,7 +52,7 @@ public abstract class AbstractBuilder { public static final String CHECKING_BEFORE_BUILDING_LOG_MESSAGE = "Checking files before building {} ..."; public static final String CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE = "Checking {} done!"; - public static final String BUILDING_LOG_MESSAGE = "Building {} ..."; + public static final String BUILDING_LOG_MESSAGE = "Building {} data ..."; public static final String BUILDING_DONE_LOG_MESSAGE = "Building done."; public static final String CATEGORY_BUILDING_LOG_MESSAGE = "Building {}/{} ..."; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GenomeSequenceFastaBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GenomeSequenceFastaBuilder.java index 17f3472b20..e9395cceea 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GenomeSequenceFastaBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GenomeSequenceFastaBuilder.java @@ -16,19 +16,25 @@ package org.opencb.cellbase.lib.builders; +import org.apache.commons.lang3.StringUtils; import org.opencb.biodata.models.core.GenomeSequenceChunk; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseSerializer; +import 
org.opencb.cellbase.lib.EtlCommons; import org.opencb.commons.utils.FileUtils; import java.io.BufferedReader; import java.io.IOException; import java.nio.file.Path; +import static org.opencb.cellbase.lib.EtlCommons.*; + public class GenomeSequenceFastaBuilder extends AbstractBuilder { private Path genomeReferenceFastaFile; private static final int CHUNK_SIZE = 2000; + public static final String GENOME_OUTPUT_FILENAME = EtlCommons.GENOME_DATA + ".json.gz"; public GenomeSequenceFastaBuilder(Path genomeReferenceFastaFile, CellBaseSerializer serializer) { super(serializer); @@ -36,9 +42,10 @@ public GenomeSequenceFastaBuilder(Path genomeReferenceFastaFile, CellBaseSeriali } @Override - public void parse() { + public void parse() throws CellBaseException { + logger.info(PARSING_LOG_MESSAGE, genomeReferenceFastaFile); - try { + try (BufferedReader br = FileUtils.newBufferedReader(genomeReferenceFastaFile)) { String sequenceName = null; String sequenceType = ""; String sequenceAssembly = null; @@ -46,8 +53,7 @@ public void parse() { StringBuilder sequenceStringBuilder = new StringBuilder(); // Preparing input and output files - BufferedReader br; - br = FileUtils.newBufferedReader(genomeReferenceFastaFile); + while ((line = br.readLine()) != null) { @@ -55,11 +61,9 @@ public void parse() { sequenceStringBuilder.append(line); } else { // new chromosome, save data - if (sequenceStringBuilder.length() > 0) { - if (!sequenceName.contains("PATCH") && !sequenceName.contains("HSCHR") && !sequenceName.contains("contig")) { - System.out.println(sequenceName); - serializeGenomeSequence(sequenceName, sequenceType, sequenceAssembly, sequenceStringBuilder.toString()); - } + if (sequenceStringBuilder.length() > 0 && StringUtils.isNotEmpty(sequenceName) && !sequenceName.contains("PATCH") + && !sequenceName.contains("HSCHR") && !sequenceName.contains("contig")) { + serializeGenomeSequence(sequenceName, sequenceType, sequenceAssembly, sequenceStringBuilder.toString()); } // initialize 
data structures @@ -75,18 +79,17 @@ public void parse() { } } // Last chromosome must be processed - if (!sequenceName.contains("PATCH") && !sequenceName.contains("HSCHR") && !sequenceName.contains("contig")) { + if (StringUtils.isNotEmpty(sequenceName) && !sequenceName.contains("PATCH") && !sequenceName.contains("HSCHR") + && !sequenceName.contains("contig")) { serializeGenomeSequence(sequenceName, sequenceType, sequenceAssembly, sequenceStringBuilder.toString()); } - - br.close(); } catch (IOException e) { e.printStackTrace(); } + logger.info(PARSING_DONE_LOG_MESSAGE); } - private void serializeGenomeSequence(String chromosome, String sequenceType, String sequenceAssembly, String sequence) - throws IOException { + private void serializeGenomeSequence(String chromosome, String sequenceType, String sequenceAssembly, String sequence) { int chunk = 0; int start = 1; int end = CHUNK_SIZE - 1; @@ -100,11 +103,10 @@ private void serializeGenomeSequence(String chromosome, String sequenceType, Str genomeSequenceChunk = new GenomeSequenceChunk(chromosome, chromosome + "_" + 0 + "_" + chunkIdSuffix, start, sequence.length() - 1, sequenceType, sequenceAssembly, chunkSequence); serializer.serialize(genomeSequenceChunk); - start += CHUNK_SIZE - 1; } else { while (start < sequence.length()) { if (chunk % 10000 == 0) { - System.out.println("Chr:" + chromosome + " chunkId:" + chunk); + logger.info("Chr: {}, chunkId: {}", chromosome, chunk); } // First chunk of the chromosome if (start == 1) { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index af1cc60123..30cd8248ae 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -35,6 +35,8 @@ public class GenomeDownloadManager extends AbstractDownloadManager 
{ private Path sequenceFolder; + public static final String GENOME_INFO_FILENAME = "genome_info.json"; + public GenomeDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) throws IOException, CellBaseException { super(species, assembly, targetDirectory, configuration); @@ -79,10 +81,8 @@ public List downloadReferenceGenome() throws IOException, Interrup } public void downloadGenomeInfo() throws IOException, CellBaseException { - String genomeInfoFilename = "genome_info.json"; - // Already downloaded - if (isAlreadyDownloaded(sequenceFolder.resolve(genomeInfoFilename), getDataName(GENOME_INFO_DATA))) { + if (isAlreadyDownloaded(sequenceFolder.resolve(GENOME_INFO_FILENAME), getDataName(GENOME_INFO_DATA))) { return; } @@ -100,7 +100,7 @@ public void downloadGenomeInfo() throws IOException, CellBaseException { String params = "/opt/cellbase/scripts/ensembl-scripts/genome_info.pl" + " --species \"" + speciesConfiguration.getScientificName() + "\"" + " --assembly \"" + assemblyConfiguration.getName() + "\"" - + " --outfile \"" + outputBinding.getValue() + "/" + genomeInfoFilename + "\""; + + " --outfile \"" + outputBinding.getValue() + "/" + GENOME_INFO_FILENAME + "\""; // Execute perl script in docker DockerUtils.run(dockerImage, null, outputBinding, params, null); From ddc10569d3ffeae5d8bdccf01604a164a3c146ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 25 Jul 2024 13:21:56 +0200 Subject: [PATCH 109/148] lib: take into account the parameter --keep when gunzip, #TASK-5576, #TASK-5564 --- .../cli/admin/executors/BuildCommandExecutor.java | 14 +++++++------- .../java/org/opencb/cellbase/lib/EtlCommons.java | 12 ++++++------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java 
b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index ef76205742..4d324836b0 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -350,22 +350,22 @@ private Path getFastaReferenceGenome() throws CellBaseException { String ensemblUrl = getEnsemblUrl(configuration.getDownload().getEnsembl(), ensemblRelease, ENSEMBL_PRIMARY_FA_FILE_ID, SpeciesUtils.getSpeciesShortname(speciesConfiguration), assembly.getName(), null); String fastaFilename = Paths.get(ensemblUrl).getFileName().toString(); - Path fastaPath = downloadFolder.resolve(GENOME_DATA).resolve(fastaFilename); - if (fastaPath.toFile().exists()) { + Path gzFastaPath = downloadFolder.resolve(GENOME_DATA).resolve(fastaFilename); + Path fastaPath = downloadFolder.resolve(GENOME_DATA).resolve(fastaFilename.replace(GZ_EXTENSION, "")); + if (!fastaPath.toFile().exists()) { // Gunzip - logger.info("Gunzip file: {}", fastaPath); + logger.info("Gunzip file: {}", gzFastaPath); try { - List params = Arrays.asList("--keep", fastaPath.toString()); + List params = Arrays.asList("--keep", gzFastaPath.toString()); EtlCommons.runCommandLineProcess(null, "gunzip", params, null); } catch (IOException e) { - throw new CellBaseException("Error executing gunzip in FASTA file " + fastaPath, e); + throw new CellBaseException("Error executing gunzip in FASTA file " + gzFastaPath, e); } catch (InterruptedException e) { // Restore interrupted state... 
Thread.currentThread().interrupt(); - throw new CellBaseException("Error executing gunzip in FASTA file " + fastaPath, e); + throw new CellBaseException("Error executing gunzip in FASTA file " + gzFastaPath, e); } } - fastaPath = downloadFolder.resolve(GENOME_DATA).resolve(fastaFilename.replace(GZ_EXTENSION, "")); if (!fastaPath.toFile().exists()) { throw new CellBaseException("FASTA file " + fastaPath + " does not exist after executing gunzip"); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 0c95580205..1ffebe30b0 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -523,12 +523,12 @@ public static boolean runCommandLineProcess(File workingDirectory, String binPat process.waitFor(); // Check process output - if (process.exitValue() != 0) { - String msg = "Error executing command '" + binPath + "'; args = " + args + ", error code = " + process.exitValue() - + ". More info in log file: " + logFilePath; - logger.error(msg); - throw new CellBaseException(msg); - } +// if (process.exitValue() != 0) { +// String msg = "Error executing command '" + binPath + "'; args = " + args + ", error code = " + process.exitValue() +// + ". 
More info in log file: " + logFilePath; +// logger.error(msg); +// throw new CellBaseException(msg); +// } return true; } From 8c6dc78ca199e1d37922c0b00f7d581f3d387bad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 26 Jul 2024 10:06:49 +0200 Subject: [PATCH 110/148] lib: improve conservation builder by adding checks, log messages and fixing sonnar issues, #TASK-5576, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 14 +++++++-- .../org/opencb/cellbase/lib/EtlCommons.java | 12 ++----- .../lib/builders/ConservationBuilder.java | 31 +++++++++++-------- 3 files changed, 32 insertions(+), 25 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 4d324836b0..22ec5971d6 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -306,12 +306,20 @@ private AbstractBuilder buildProtein() throws CellBaseException { } private AbstractBuilder buildConservation() throws CellBaseException { + logger.info(BUILDING_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); + // Sanity check Path conservationDownloadPath = downloadFolder.resolve(CONSERVATION_DATA); Path conservationBuildPath = buildFolder.resolve(CONSERVATION_DATA); - copyVersionFiles(Arrays.asList(conservationDownloadPath.resolve(getDataVersionFilename(GERP_DATA)), - conservationDownloadPath.resolve(getDataVersionFilename(PHASTCONS_DATA)), - conservationDownloadPath.resolve(getDataVersionFilename(PHYLOP_DATA))), conservationBuildPath); + + // Check and copy version files + List dataList = Arrays.asList(GERP_DATA, PHASTCONS_DATA, PHYLOP_DATA); + for (String data : dataList) { + 
checkVersionFiles(Collections.singletonList(conservationDownloadPath.resolve(data).resolve(getDataVersionFilename(data)))); + } + copyVersionFiles(Arrays.asList(conservationDownloadPath.resolve(GERP_DATA).resolve(getDataVersionFilename(GERP_DATA)), + conservationDownloadPath.resolve(PHASTCONS_DATA).resolve(getDataVersionFilename(PHASTCONS_DATA)), + conservationDownloadPath.resolve(PHYLOP_DATA).resolve(getDataVersionFilename(PHYLOP_DATA))), conservationBuildPath); int conservationChunkSize = MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(conservationBuildPath); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 1ffebe30b0..460987c6e0 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -17,8 +17,6 @@ package org.opencb.cellbase.lib; import org.apache.commons.lang3.StringUtils; -import org.apache.logging.log4j.Level; -import org.apache.logging.log4j.core.config.Configurator; import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.lib.download.DownloadFile; @@ -348,6 +346,8 @@ public final class EtlCommons { private static Map dataCategoriesMap = new HashMap<>(); private static Map dataVersionFilenamesMap = new HashMap<>(); + private static final Logger LOGGER = LoggerFactory.getLogger(EtlCommons.class); + static { // Populate data names map @@ -510,15 +510,9 @@ private EtlCommons() { public static boolean runCommandLineProcess(File workingDirectory, String binPath, List args, String logFilePath) throws IOException, InterruptedException, CellBaseException { - Configurator.setRootLevel(Level.INFO); - - Logger logger = LoggerFactory.getLogger("EtlCommons"); - ProcessBuilder builder = 
getProcessBuilder(workingDirectory, binPath, args, logFilePath); - if (logger.isDebugEnabled()) { - logger.debug("Executing command: {}", StringUtils.join(builder.command(), " ")); - } + LOGGER.debug("Executing command: {}", StringUtils.join(builder.command(), " ")); Process process = builder.start(); process.waitFor(); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java index 9f2ae630f9..aadcdb6caf 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java @@ -56,8 +56,6 @@ public ConservationBuilder(Path conservedRegionPath, int chunkSize, CellBaseFile @Override public void parse() throws IOException, CellBaseException { - logger.info(BUILDING_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); - if (conservedRegionPath == null || !Files.exists(conservedRegionPath) || !Files.isDirectory(conservedRegionPath)) { throw new IOException("Conservation directory " + conservedRegionPath + " does not exist or it is not a directory or it cannot" + " be read"); @@ -65,17 +63,17 @@ public void parse() throws IOException, CellBaseException { // Check GERP folder and files Path gerpPath = conservedRegionPath.resolve(GERP_DATA); - DataSource dataSource = dataSourceReader.readValue(conservedRegionPath.resolve(getDataVersionFilename(GERP_DATA)).toFile()); + DataSource dataSource = dataSourceReader.readValue(gerpPath.resolve(getDataVersionFilename(GERP_DATA)).toFile()); List gerpFiles = checkFiles(dataSource, gerpPath, getDataName(GERP_DATA)); // Check PhastCons folder and files Path phastConsPath = conservedRegionPath.resolve(PHASTCONS_DATA); - dataSource = dataSourceReader.readValue(conservedRegionPath.resolve(getDataVersionFilename(PHASTCONS_DATA)).toFile()); + dataSource = 
dataSourceReader.readValue(phastConsPath.resolve(getDataVersionFilename(PHASTCONS_DATA)).toFile()); List phastConsFiles = checkFiles(dataSource, phastConsPath, getDataName(PHASTCONS_DATA)); // Check PhyloP folder and files Path phylopPath = conservedRegionPath.resolve(PHYLOP_DATA); - dataSource = dataSourceReader.readValue(conservedRegionPath.resolve(getDataVersionFilename(PHYLOP_DATA)).toFile()); + dataSource = dataSourceReader.readValue(phylopPath.resolve(getDataVersionFilename(PHYLOP_DATA)).toFile()); List phylopFiles = checkFiles(dataSource, phylopPath, getDataName(PHYLOP_DATA)); // GERP is downloaded from Ensembl as a bigwig file. The library we have doesn't seem to parse @@ -137,8 +135,6 @@ public void parse() throws IOException, CellBaseException { logger.debug("Processing chromosome '{}', file '{}'", chr, files.get(chr + PHYLOP_DATA)); processWigFixFile(files.get(chr + PHYLOP_DATA), PHYLOP_DATA); } - - logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); } private void gerpParser(Path gerpProcessFilePath) throws IOException, CellBaseException { @@ -271,11 +267,10 @@ private void storeScores(int startOfBatch, String chromosome, List conser conservationScores.clear(); } - private void processWigFixFile(Path inGzPath, String conservationSource) throws IOException { + private void processWigFixFile(Path inGzPath, String conservationSource) { logger.info(PARSING_LOG_MESSAGE, inGzPath); + String line = null; try (BufferedReader bufferedReader = FileUtils.newBufferedReader(inGzPath)) { - - String line; String chromosome = ""; int start = 0; float value; @@ -322,7 +317,12 @@ private void processWigFixFile(Path inGzPath, String conservationSource) throws values.clear(); } - value = Float.parseFloat(line.trim()); + try { + value = Float.parseFloat(line.trim()); + } catch (NumberFormatException e) { + value = 0; + logger.warn("Invalid value: {}. 
Stack trace: {}", line, e.getStackTrace()); + } values.add(value); } } @@ -330,6 +330,8 @@ private void processWigFixFile(Path inGzPath, String conservationSource) throws // Write last conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, conservationSource, values); fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); + } catch (Exception e) { + logger.error("ERROR parsing {}. Line: {}. Stack trace: {}", inGzPath, line, e.getStackTrace()); } logger.info(PARSING_DONE_LOG_MESSAGE, inGzPath); } @@ -339,8 +341,11 @@ private String getOutputFileName(String chromosome) { if (chromosome.equals("M")) { chromosome = "MT"; } - String outputFileName = outputFileNames.get(chromosome); - if (outputFileName == null) { + + String outputFileName; + if (outputFileNames.containsKey(chromosome)) { + outputFileName = outputFileNames.get(chromosome); + } else { outputFileName = getFilename(CONSERVATION_DATA, chromosome); outputFileNames.put(chromosome, outputFileName); } From 847f8359a09ff48f5f9c32ca223bfb7d6a7f0690 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 26 Jul 2024 15:48:52 +0200 Subject: [PATCH 111/148] lib: add support for multi-species, checks and log messages in the repeats builder, #TASK-5576, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 32 ++++++--- .../org/opencb/cellbase/lib/EtlCommons.java | 49 +++++++++++-- .../lib/builders/AbstractBuilder.java | 9 +++ .../cellbase/lib/builders/RepeatsBuilder.java | 72 ++++++++++++------- .../lib/builders/RepeatsBuilderTest.java | 10 +-- 5 files changed, 128 insertions(+), 44 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 22ec5971d6..96dd7f16e4 100644 --- 
a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -39,6 +39,7 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardCopyOption; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; @@ -47,6 +48,7 @@ import static org.opencb.cellbase.lib.builders.AbstractBuilder.BUILDING_DONE_LOG_MESSAGE; import static org.opencb.cellbase.lib.builders.AbstractBuilder.BUILDING_LOG_MESSAGE; import static org.opencb.cellbase.lib.builders.GenomeSequenceFastaBuilder.GENOME_OUTPUT_FILENAME; +import static org.opencb.cellbase.lib.builders.RepeatsBuilder.REPEATS_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.download.GenomeDownloadManager.GENOME_INFO_FILENAME; @@ -233,16 +235,32 @@ private AbstractBuilder buildGene() throws CellBaseException { } private AbstractBuilder buildRepeats() throws CellBaseException { + logger.info(BUILDING_LOG_MESSAGE, getDataName(REPEATS_DATA)); + // Sanity check Path repeatsDownloadPath = downloadFolder.resolve(REPEATS_DATA); - List versionPaths = Arrays.asList(repeatsDownloadPath.resolve(getDataVersionFilename(TRF_DATA)), - repeatsDownloadPath.resolve(getDataVersionFilename(GSD_DATA)), - repeatsDownloadPath.resolve(getDataVersionFilename(WM_DATA))); - copyVersionFiles(versionPaths, buildFolder.resolve(REPEATS_DATA)); + Path repeatsBuildPath = buildFolder.resolve(REPEATS_DATA); + List dataList = EtlCommons.getDataList(REPEATS_DATA, configuration, speciesConfiguration); + List filesToCheck = new ArrayList<>(); + filesToCheck.add(repeatsBuildPath.resolve(REPEATS_OUTPUT_FILENAME)); + for (String data : dataList) { + filesToCheck.add(repeatsBuildPath.resolve(getDataVersionFilename(data))); + } + if (AbstractBuilder.existFiles(filesToCheck)) { + logger.warn("{} data has been already built", getDataName(REPEATS_DATA)); + 
return null; + } + for (String data : dataList) { + checkVersionFiles(Collections.singletonList(repeatsDownloadPath.resolve(data).resolve(getDataVersionFilename(data)))); + } + for (String data : dataList) { + copyVersionFiles(Collections.singletonList(repeatsDownloadPath.resolve(data).resolve(getDataVersionFilename(data))), + repeatsBuildPath); + } // Create serializer and return the repeats builder - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(REPEATS_DATA), REPEATS_BASENAME); - return new RepeatsBuilder(repeatsDownloadPath, serializer, configuration); + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(REPEATS_DATA), REPEATS_DATA); + return new RepeatsBuilder(dataList, repeatsDownloadPath, serializer, configuration); } private AbstractBuilder buildObo() throws CellBaseException { @@ -311,8 +329,6 @@ private AbstractBuilder buildConservation() throws CellBaseException { // Sanity check Path conservationDownloadPath = downloadFolder.resolve(CONSERVATION_DATA); Path conservationBuildPath = buildFolder.resolve(CONSERVATION_DATA); - - // Check and copy version files List dataList = Arrays.asList(GERP_DATA, PHASTCONS_DATA, PHYLOP_DATA); for (String data : dataList) { checkVersionFiles(Collections.singletonList(conservationDownloadPath.resolve(data).resolve(getDataVersionFilename(data)))); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 460987c6e0..055aa67162 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -17,7 +17,9 @@ package org.opencb.cellbase.lib; import org.apache.commons.lang3.StringUtils; +import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.config.DownloadProperties; +import org.opencb.cellbase.core.config.SpeciesConfiguration; 
import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.lib.download.DownloadFile; import org.opencb.commons.utils.FileUtils; @@ -31,10 +33,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.stream.Collectors; /** @@ -212,7 +211,6 @@ public final class EtlCommons { // Repeats public static final String REPEATS_DATA = "repeats"; - public static final String REPEATS_BASENAME = "repeats"; /** * @deprecated (when refactoring downloaders, builders and loaders) */ @@ -708,4 +706,45 @@ public static String getManualUrl(String host, String file) { } return null; } + + public static List getDataList(String data, CellBaseConfiguration configuration, SpeciesConfiguration speciesConfiguration) + throws CellBaseException { + switch (data) { + case REPEATS_DATA: { + return getRepeatsDataList(configuration, speciesConfiguration); + } + default: { + throw new CellBaseException("Unknown data " + data); + } + } + } + + private static List getRepeatsDataList(CellBaseConfiguration configuration, SpeciesConfiguration speciesConfiguration) { + List dataList = new ArrayList<>(); + String speciesId = speciesConfiguration.getId().toUpperCase(Locale.ROOT); + if (speciesId.equalsIgnoreCase(HSAPIENS_NAME)) { + return Arrays.asList(TRF_DATA, WM_DATA, GSD_DATA); + } + + if (isDataSupported(configuration.getDownload().getSimpleRepeats(), speciesId)) { + dataList.add(TRF_DATA); + } + if (isDataSupported(configuration.getDownload().getWindowMasker(), speciesId)) { + dataList.add(WM_DATA); + } + if (isDataSupported(configuration.getDownload().getGenomicSuperDups(), speciesId)) { + dataList.add(GSD_DATA); + } + return dataList; + } + + private static boolean isDataSupported(DownloadProperties.URLProperties props, String prefix) { + for (String key : props.getFiles().keySet()) { + if 
(key.startsWith(prefix)) { + return true; + } + } + return false; + } + } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AbstractBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AbstractBuilder.java index 155b41cc5e..aa10b2644a 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AbstractBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AbstractBuilder.java @@ -195,4 +195,13 @@ protected Path getIndexFastaReferenceGenome(Path fastaPath) throws CellBaseExcep } return indexFastaPath; } + + public static boolean existFiles(List paths) { + for (Path path : paths) { + if (!Files.exists(path)) { + return false; + } + } + return true; + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java index 041c52f522..ce55659f65 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java @@ -30,6 +30,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.List; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -40,10 +41,14 @@ public class RepeatsBuilder extends AbstractBuilder { private CellBaseConfiguration configuration; + private List dataList; private final Path filesDir; - public RepeatsBuilder(Path filesDir, CellBaseFileSerializer serializer, CellBaseConfiguration configuration) { + public static final String REPEATS_OUTPUT_FILENAME = EtlCommons.REPEATS_DATA + ".json.gz"; + + public RepeatsBuilder(List dataList, Path filesDir, CellBaseFileSerializer serializer, CellBaseConfiguration configuration) { super(serializer); + this.dataList = dataList; this.filesDir = filesDir; this.configuration = configuration; } @@ -51,51 +56,60 @@ public RepeatsBuilder(Path filesDir, 
CellBaseFileSerializer serializer, CellBase @Override public void parse() throws Exception { - logger.info(BUILDING_LOG_MESSAGE, getDataName(REPEATS_DATA)); - // Sanity check checkDirectory(filesDir, getDataName(REPEATS_DATA)); // Check Simple Repeats (TRF) filename - String trfFilename = Paths.get(configuration.getDownload().getSimpleRepeats().getFiles().get(SIMPLE_REPEATS_FILE_ID)).getFileName() - .toString(); - if (!Files.exists(filesDir.resolve(trfFilename))) { - throw new CellBaseException(getMessageMissingFile(TRF_DATA, trfFilename, filesDir)); + String trfFilename = null; + if (dataList.contains(TRF_DATA)) { + trfFilename = Paths.get(configuration.getDownload().getSimpleRepeats().getFiles().get(SIMPLE_REPEATS_FILE_ID)).getFileName() + .toString(); + if (!Files.exists(filesDir.resolve(trfFilename))) { + throw new CellBaseException(getMessageMissingFile(TRF_DATA, trfFilename, filesDir)); + } } // Check Genomic Super Duplications (GSD) file - String gsdFilename = Paths.get(configuration.getDownload().getGenomicSuperDups().getFiles().get(GENOMIC_SUPER_DUPS_FILE_ID)) - .getFileName().toString(); - if (!Files.exists(filesDir.resolve(gsdFilename))) { - throw new CellBaseException(getMessageMissingFile(GSD_DATA, gsdFilename, filesDir)); + String gsdFilename = null; + if (dataList.contains(GSD_DATA)) { + gsdFilename = Paths.get(configuration.getDownload().getGenomicSuperDups().getFiles().get(GENOMIC_SUPER_DUPS_FILE_ID)) + .getFileName().toString(); + if (!Files.exists(filesDir.resolve(gsdFilename))) { + throw new CellBaseException(getMessageMissingFile(GSD_DATA, gsdFilename, filesDir)); + } } // Check Window Masker (WM) file - String wmFilename = Paths.get(configuration.getDownload().getWindowMasker().getFiles().get(WINDOW_MASKER_FILE_ID)).getFileName() - .toString(); - if (!Files.exists(filesDir.resolve(wmFilename))) { - throw new CellBaseException(getMessageMissingFile(WM_DATA, wmFilename, filesDir)); + String wmFilename = null; + if (dataList.contains(WM_DATA)) { 
+ wmFilename = Paths.get(configuration.getDownload().getWindowMasker().getFiles().get(WINDOW_MASKER_FILE_ID)).getFileName() + .toString(); + if (!Files.exists(filesDir.resolve(wmFilename))) { + throw new CellBaseException(getMessageMissingFile(WM_DATA, wmFilename, filesDir)); + } } // Parse TRF file - logger.info(BUILDING_LOG_MESSAGE, getDataName(TRF_DATA)); - parseTrfFile(filesDir.resolve(trfFilename)); - logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(TRF_DATA)); + if (dataList.contains(TRF_DATA)) { + logger.info(PARSING_LOG_MESSAGE, getDataName(TRF_DATA)); + parseTrfFile(filesDir.resolve(trfFilename)); + } // Parse GSD file - logger.info(BUILDING_LOG_MESSAGE, getDataName(GSD_DATA)); - parseGsdFile(filesDir.resolve(gsdFilename)); - logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(GSD_DATA)); + if (dataList.contains(GSD_DATA)) { + logger.info(PARSING_LOG_MESSAGE, getDataName(GSD_DATA)); + parseGsdFile(filesDir.resolve(gsdFilename)); + } // Parse WM file - logger.info(BUILDING_LOG_MESSAGE, getDataName(WM_DATA)); - parseWmFile(filesDir.resolve(wmFilename)); - logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(WM_DATA)); - - logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(REPEATS_DATA)); + if (dataList.contains(WM_DATA)) { + logger.info(PARSING_LOG_MESSAGE, getDataName(WM_DATA)); + parseWmFile(filesDir.resolve(wmFilename)); + } } private void parseTrfFile(Path filePath) throws IOException, CellBaseException { + logger.info(PARSING_LOG_MESSAGE, filePath); try (BufferedReader bufferedReader = FileUtils.newBufferedReader(filePath)) { String line = bufferedReader.readLine(); @@ -107,6 +121,7 @@ private void parseTrfFile(Path filePath) throws IOException, CellBaseException { progressLogger.increment(1); } } + logger.info(PARSING_DONE_LOG_MESSAGE); } private Repeat parseTrfLine(String line) { @@ -118,6 +133,7 @@ private Repeat parseTrfLine(String line) { } private void parseGsdFile(Path filePath) throws IOException, CellBaseException { + 
logger.info(PARSING_LOG_MESSAGE, filePath); try (BufferedReader bufferedReader = FileUtils.newBufferedReader(filePath)) { String line = bufferedReader.readLine(); @@ -129,6 +145,7 @@ private void parseGsdFile(Path filePath) throws IOException, CellBaseException { progressLogger.increment(1); } } + logger.info(PARSING_DONE_LOG_MESSAGE); } private Repeat parseGSDLine(String line) { @@ -141,6 +158,7 @@ private Repeat parseGSDLine(String line) { } private void parseWmFile(Path filePath) throws IOException, CellBaseException { + logger.info(PARSING_LOG_MESSAGE, filePath); try (BufferedReader bufferedReader = FileUtils.newBufferedReader(filePath)) { String line = bufferedReader.readLine(); @@ -152,6 +170,7 @@ private void parseWmFile(Path filePath) throws IOException, CellBaseException { progressLogger.increment(1); } } + logger.info(PARSING_DONE_LOG_MESSAGE); } private Repeat parseWmLine(String line) { @@ -168,6 +187,5 @@ private String getMessageMissingFile(String data, String filename, Path folder) private String getMessageParsedLines(String data) throws CellBaseException { return "Parsed " + getDataName(data) + " lines:"; } - } diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RepeatsBuilderTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RepeatsBuilderTest.java index 8e27bf3f98..6a98066f92 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RepeatsBuilderTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RepeatsBuilderTest.java @@ -16,8 +16,8 @@ package org.opencb.cellbase.lib.builders; -import org.junit.jupiter.api.Test; import org.eclipse.jetty.util.ajax.JSON; +import org.junit.jupiter.api.Test; import org.opencb.biodata.models.variant.avro.Repeat; import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; @@ -26,13 +26,15 @@ import java.io.BufferedReader; import java.io.IOException; -import java.net.URL; 
import java.nio.file.Path; import java.nio.file.Paths; +import java.util.Arrays; import java.util.HashSet; import java.util.Set; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.opencb.cellbase.lib.EtlCommons.*; /** @@ -51,7 +53,7 @@ public void testParse() throws Exception { CellBaseConfiguration configuration = CellBaseConfiguration.load(getClass().getClassLoader().getResourceAsStream("configuration.test.yaml")); Path repeatsFilesDir = Paths.get(getClass().getResource("/repeats").getPath()); CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), "repeats.test"); - (new RepeatsBuilder(repeatsFilesDir, serializer, configuration)).parse(); + (new RepeatsBuilder(Arrays.asList(WM_DATA, GSD_DATA, TRF_DATA), repeatsFilesDir, serializer, configuration)).parse(); serializer.close(); Set expected = loadRepeatSet(Paths.get(getClass().getClassLoader().getResource("repeats/repeats.test.json.gz").getPath())); Set current = loadRepeatSet(Paths.get("/tmp/repeats.test.json.gz")); From b0d1c67767ed4292446380c3187432c5d70ccc74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 26 Jul 2024 17:05:53 +0200 Subject: [PATCH 112/148] lib: add support for multi-species, checks and log messages in regulation builder, #TASK-5576, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 26 ++++++++++++---- .../org/opencb/cellbase/lib/EtlCommons.java | 2 -- .../lib/builders/AbstractBuilder.java | 4 +-- .../builders/RegulatoryFeatureBuilder.java | 30 ++++++++++++------- 4 files changed, 41 insertions(+), 21 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 96dd7f16e4..273974ff23 100644 --- 
a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -48,6 +48,7 @@ import static org.opencb.cellbase.lib.builders.AbstractBuilder.BUILDING_DONE_LOG_MESSAGE; import static org.opencb.cellbase.lib.builders.AbstractBuilder.BUILDING_LOG_MESSAGE; import static org.opencb.cellbase.lib.builders.GenomeSequenceFastaBuilder.GENOME_OUTPUT_FILENAME; +import static org.opencb.cellbase.lib.builders.RegulatoryFeatureBuilder.*; import static org.opencb.cellbase.lib.builders.RepeatsBuilder.REPEATS_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.download.GenomeDownloadManager.GENOME_INFO_FILENAME; @@ -67,6 +68,8 @@ public class BuildCommandExecutor extends CommandExecutor { private boolean flexibleGTFParsing; + private static final String DATA_ALREADY_BUILT = "{} data has already been built."; + public BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildCommandOptions) { super(buildCommandOptions.commonOptions.logLevel, buildCommandOptions.commonOptions.conf); @@ -194,7 +197,7 @@ private AbstractBuilder buildGenomeSequence() throws CellBaseException { if (Files.exists(genomeBuildFolder.resolve(GENOME_OUTPUT_FILENAME)) && Files.exists(genomeBuildFolder.resolve(GENOME_INFO_FILENAME)) && Files.exists(genomeBuildFolder.resolve(getDataVersionFilename(GENOME_DATA)))) { - logger.warn("{} data has been already built", getDataName(GENOME_DATA)); + logger.warn(DATA_ALREADY_BUILT, getDataName(GENOME_DATA)); return null; } @@ -241,13 +244,12 @@ private AbstractBuilder buildRepeats() throws CellBaseException { Path repeatsDownloadPath = downloadFolder.resolve(REPEATS_DATA); Path repeatsBuildPath = buildFolder.resolve(REPEATS_DATA); List dataList = EtlCommons.getDataList(REPEATS_DATA, configuration, speciesConfiguration); - List filesToCheck = new ArrayList<>(); - 
filesToCheck.add(repeatsBuildPath.resolve(REPEATS_OUTPUT_FILENAME)); + List filesToCheck = new ArrayList<>(Arrays.asList(repeatsBuildPath.resolve(REPEATS_OUTPUT_FILENAME))); for (String data : dataList) { filesToCheck.add(repeatsBuildPath.resolve(getDataVersionFilename(data))); } if (AbstractBuilder.existFiles(filesToCheck)) { - logger.warn("{} data has been already built", getDataName(REPEATS_DATA)); + logger.warn(DATA_ALREADY_BUILT, getDataName(REPEATS_DATA)); return null; } for (String data : dataList) { @@ -300,11 +302,23 @@ private AbstractBuilder buildRevel() throws CellBaseException { } private AbstractBuilder buildRegulation() throws CellBaseException { + logger.info(BUILDING_LOG_MESSAGE, getDataName(REGULATION_DATA)); + // Sanity check Path regulationDownloadPath = downloadFolder.resolve(REGULATION_DATA); Path regulationBuildPath = buildFolder.resolve(REGULATION_DATA); - copyVersionFiles(Arrays.asList(regulationDownloadPath.resolve(getDataVersionFilename(REGULATORY_BUILD_DATA)), - regulationDownloadPath.resolve(getDataVersionFilename(MOTIF_FEATURES_DATA))), regulationBuildPath); + List filesToCheck = Arrays.asList(regulationBuildPath.resolve(REGULATORY_REGION_OUTPUT_FILENAME), + regulationBuildPath.resolve(REGULATORY_PFM_OUTPUT_FILENAME), + regulationBuildPath.resolve(getDataVersionFilename(REGULATORY_BUILD_DATA)), + regulationBuildPath.resolve(getDataVersionFilename(MOTIF_FEATURES_DATA))); + if (AbstractBuilder.existFiles(filesToCheck)) { + logger.warn(DATA_ALREADY_BUILT, getDataName(REGULATION_DATA)); + return null; + } + + copyVersionFiles(Arrays.asList(regulationDownloadPath.resolve(REGULATORY_BUILD_DATA).resolve(getDataVersionFilename( + REGULATORY_BUILD_DATA)), regulationDownloadPath.resolve(MOTIF_FEATURES_DATA).resolve(getDataVersionFilename( + MOTIF_FEATURES_DATA))), regulationBuildPath); // Create the file serializer and the regulatory feature builder CellBaseSerializer serializer = new CellBaseJsonFileSerializer(regulationBuildPath, 
REGULATORY_REGION_BASENAME); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 055aa67162..11af71249b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -263,8 +263,6 @@ public final class EtlCommons { // Regulation public static final String REGULATION_DATA = "regulation"; - public static final String REGULATORY_PFM_BASENAME = "regulatory_pfm"; - public static final String REGULATORY_REGION_BASENAME = "regulatory_region"; // Regulatory build and motif features (see Ensembl files: regulatory build and motif features files) public static final String REGULATORY_BUILD_DATA = "regulatory_build"; // Motif features (see Ensembl files) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AbstractBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AbstractBuilder.java index aa10b2644a..8359f26e8d 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AbstractBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AbstractBuilder.java @@ -61,7 +61,7 @@ public abstract class AbstractBuilder { public static final String PARSING_LOG_MESSAGE = "Parsing {} ..."; public static final String PARSING_DONE_LOG_MESSAGE = "Parsing done."; - public AbstractBuilder(CellBaseSerializer serializer) { + protected AbstractBuilder(CellBaseSerializer serializer) { logger = LoggerFactory.getLogger(this.getClass()); this.serializer = serializer; @@ -75,7 +75,7 @@ public void disconnect() { try { serializer.close(); } catch (Exception e) { - logger.error("Error closing serializer:\n" + StringUtils.join(e.getStackTrace(), "\n")); + logger.error("Error closing serializer. 
Stack trace: {}", e.getStackTrace()); } } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java index 752290e147..280fc631bb 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java @@ -45,9 +45,13 @@ public class RegulatoryFeatureBuilder extends AbstractBuilder { private Path regulationPath; - private Set regulatoryFeatureSet; + public static final String REGULATORY_REGION_BASENAME = "regulatory_region"; + public static final String REGULATORY_REGION_OUTPUT_FILENAME = REGULATORY_REGION_BASENAME + ".json.gz"; + public static final String REGULATORY_PFM_BASENAME = "regulatory_pfm"; + public static final String REGULATORY_PFM_OUTPUT_FILENAME = REGULATORY_PFM_BASENAME + ".json.gz"; + public RegulatoryFeatureBuilder(Path regulationPath, CellBaseSerializer serializer) { super(serializer); this.regulationPath = regulationPath; @@ -55,14 +59,17 @@ public RegulatoryFeatureBuilder(Path regulationPath, CellBaseSerializer serializ @Override public void parse() throws Exception { - logger.info(BUILDING_LOG_MESSAGE, getDataName(REGULATION_DATA)); - // Sanity check checkDirectory(regulationPath, getDataName(REGULATION_DATA)); + DataSource dataSource; + List regulatoryFiles; + List motifFeaturesFiles; + // Check build regulatory files - DataSource dataSource = dataSourceReader.readValue(regulationPath.resolve(getDataVersionFilename(REGULATORY_BUILD_DATA)).toFile()); - List regulatoryFiles = checkFiles(dataSource, regulationPath, getDataCategory(REGULATORY_BUILD_DATA) + "/" + dataSource = dataSourceReader.readValue(regulationPath.resolve(REGULATORY_BUILD_DATA) + .resolve(getDataVersionFilename(REGULATORY_BUILD_DATA)).toFile()); + regulatoryFiles = checkFiles(dataSource, 
regulationPath.resolve(REGULATORY_BUILD_DATA), getDataCategory(REGULATORY_BUILD_DATA) + "/" + getDataName(REGULATORY_BUILD_DATA)); if (regulatoryFiles.size() != 1) { throw new CellBaseException("One " + getDataName(REGULATORY_BUILD_DATA) + " file is expected, but currently there are " @@ -70,8 +77,9 @@ public void parse() throws Exception { } // Check motif features files - dataSource = dataSourceReader.readValue(regulationPath.resolve(getDataVersionFilename(MOTIF_FEATURES_DATA)).toFile()); - List motifFeaturesFiles = checkFiles(dataSource, regulationPath, getDataCategory(MOTIF_FEATURES_DATA) + "/" + dataSource = dataSourceReader.readValue(regulationPath.resolve(MOTIF_FEATURES_DATA) + .resolve(getDataVersionFilename(MOTIF_FEATURES_DATA)).toFile()); + motifFeaturesFiles = checkFiles(dataSource, regulationPath.resolve(MOTIF_FEATURES_DATA), getDataCategory(MOTIF_FEATURES_DATA) + "/" + getDataName(MOTIF_FEATURES_DATA)); if (motifFeaturesFiles.size() != 2) { throw new CellBaseException("Two " + getDataName(MOTIF_FEATURES_DATA) + " files are expected, but currently there are " @@ -84,8 +92,6 @@ public void parse() throws Exception { // Parse regulatory build features parseGffFile(regulatoryFiles.get(0).toPath()); - - logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(REGULATION_DATA)); } protected void parseGffFile(Path regulatoryFeatureFile) throws IOException, NoSuchMethodException, FileFormatException { @@ -110,12 +116,12 @@ protected void parseGffFile(Path regulatoryFeatureFile) throws IOException, NoSu } serializer.close(); - logger.info(PARSING_DONE_LOG_MESSAGE, regulatoryFeatureFile); + logger.info(PARSING_DONE_LOG_MESSAGE); } private void loadPfmMatrices(Path motifGffFile, Path buildFolder) throws IOException, NoSuchMethodException, FileFormatException, InterruptedException { - Path regulatoryPfmPath = buildFolder.resolve(REGULATORY_PFM_BASENAME + ".json.gz"); + Path regulatoryPfmPath = buildFolder.resolve(REGULATORY_PFM_OUTPUT_FILENAME); logger.info("Downloading 
and building PFM matrices in {} from {} ...", regulatoryPfmPath, motifGffFile); if (Files.exists(regulatoryPfmPath)) { logger.info("{} is already built", regulatoryPfmPath); @@ -123,6 +129,7 @@ private void loadPfmMatrices(Path motifGffFile, Path buildFolder) throws IOExcep } Set motifIds = new HashSet<>(); + logger.info(PARSING_LOG_MESSAGE, motifGffFile); try (Gff2Reader motifsFeatureReader = new Gff2Reader(motifGffFile)) { Gff2 tfbsMotifFeature; Pattern filePattern = Pattern.compile("ENSPFM(\\d+)"); @@ -133,6 +140,7 @@ private void loadPfmMatrices(Path motifGffFile, Path buildFolder) throws IOExcep } } } + logger.info(PARSING_DONE_LOG_MESSAGE); ObjectMapper mapper = new ObjectMapper(); CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, REGULATORY_PFM_BASENAME, true); From 039aa81f96887763ebd1cfcaea4280e5fbb09a21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 29 Jul 2024 17:07:49 +0200 Subject: [PATCH 113/148] lib: fix protein builder, #TASK-5576, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 18 ++++- .../cellbase/lib/builders/ProteinBuilder.java | 67 ++++++++++++------- 2 files changed, 60 insertions(+), 25 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 273974ff23..129b31e78d 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -48,6 +48,7 @@ import static org.opencb.cellbase.lib.builders.AbstractBuilder.BUILDING_DONE_LOG_MESSAGE; import static org.opencb.cellbase.lib.builders.AbstractBuilder.BUILDING_LOG_MESSAGE; import static org.opencb.cellbase.lib.builders.GenomeSequenceFastaBuilder.GENOME_OUTPUT_FILENAME; +import static 
org.opencb.cellbase.lib.builders.ProteinBuilder.OUTPUT_PROTEIN_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.builders.RegulatoryFeatureBuilder.*; import static org.opencb.cellbase.lib.builders.RepeatsBuilder.REPEATS_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.download.GenomeDownloadManager.GENOME_INFO_FILENAME; @@ -326,11 +327,24 @@ private AbstractBuilder buildRegulation() throws CellBaseException { } private AbstractBuilder buildProtein() throws CellBaseException { + logger.info(BUILDING_LOG_MESSAGE, getDataName(PROTEIN_DATA)); + // Sanity check Path proteinDownloadPath = downloadFolder.resolve(PROTEIN_DATA); Path proteinBuildPath = buildFolder.resolve(PROTEIN_DATA); - copyVersionFiles(Arrays.asList(proteinDownloadPath.resolve(getDataVersionFilename(UNIPROT_DATA)), - proteinDownloadPath.resolve(getDataVersionFilename(INTERPRO_DATA))), proteinBuildPath); + List filesToCheck = Arrays.asList(proteinBuildPath.resolve(OUTPUT_PROTEIN_OUTPUT_FILENAME), + proteinBuildPath.resolve(getDataVersionFilename(INTERPRO_DATA)), + proteinBuildPath.resolve(getDataVersionFilename(INTACT_DATA)), + proteinBuildPath.resolve(getDataVersionFilename(UNIPROT_DATA))); + if (AbstractBuilder.existFiles(filesToCheck)) { + logger.warn(DATA_ALREADY_BUILT, getDataName(PROTEIN_DATA)); + return null; + } + + copyVersionFiles(Arrays.asList(proteinDownloadPath.resolve(INTERPRO_DATA).resolve(getDataVersionFilename( + INTERPRO_DATA)), proteinDownloadPath.resolve(INTACT_DATA).resolve(getDataVersionFilename( + INTACT_DATA)), proteinDownloadPath.resolve(UNIPROT_DATA).resolve(getDataVersionFilename( + UNIPROT_DATA))), proteinBuildPath); // Create the file serializer and the protein builder CellBaseSerializer serializer = new CellBaseJsonFileSerializer(proteinBuildPath, PROTEIN_DATA); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java index 4beef32a99..1407d02239 
100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java @@ -29,8 +29,6 @@ import org.rocksdb.RocksDB; import org.rocksdb.RocksDBException; import org.rocksdb.RocksIterator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import javax.xml.bind.JAXBException; import java.io.BufferedReader; @@ -41,6 +39,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.*; +import java.util.stream.Collectors; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -49,7 +48,7 @@ public class ProteinBuilder extends AbstractBuilder { private Path proteinPath; private String species; - protected Logger logger = LoggerFactory.getLogger(this.getClass()); + public static final String OUTPUT_PROTEIN_OUTPUT_FILENAME = PROTEIN_DATA + ".json.gz"; public ProteinBuilder(Path proteinPath, String species, CellBaseSerializer serializer) { super(serializer); @@ -60,32 +59,31 @@ public ProteinBuilder(Path proteinPath, String species, CellBaseSerializer seria @Override public void parse() throws CellBaseException, IOException { - logger.info(BUILDING_LOG_MESSAGE, getDataName(PROTEIN_DATA)); - // Sanity check checkDirectory(proteinPath, getDataName(PROTEIN_DATA)); // Check UniProt file - DataSource dataSource = dataSourceReader.readValue(proteinPath.resolve(getDataVersionFilename(UNIPROT_DATA)).toFile()); - List uniProtFiles = checkFiles(dataSource, proteinPath, getDataCategory(UNIPROT_DATA) + "/" + getDataName(UNIPROT_DATA)); + DataSource dataSource = dataSourceReader.readValue(proteinPath.resolve(UNIPROT_DATA).resolve(getDataVersionFilename(UNIPROT_DATA)) + .toFile()); + List uniProtFiles = checkFiles(dataSource, proteinPath.resolve(UNIPROT_DATA), getDataCategory(UNIPROT_DATA) + "/" + + getDataName(UNIPROT_DATA)); if (uniProtFiles.size() != 1) { - throw new CellBaseException("Only one " + getDataName(UNIPROT_DATA) + " file is expected, but 
currently there are " - + uniProtFiles.size() + " files"); + throw new CellBaseException(getMismatchNumFilesErrorMessage(getDataName(UNIPROT_DATA), uniProtFiles.size())); } // Check InterPro file - dataSource = dataSourceReader.readValue(proteinPath.resolve(getDataVersionFilename(INTERPRO_DATA)).toFile()); - List interProFiles = checkFiles(dataSource, proteinPath, getDataCategory(INTERPRO_DATA) + "/" + getDataName(INTERPRO_DATA)); + dataSource = dataSourceReader.readValue(proteinPath.resolve(INTERPRO_DATA).resolve(getDataVersionFilename(INTERPRO_DATA)).toFile()); + List interProFiles = checkFiles(dataSource, proteinPath.resolve(INTERPRO_DATA), getDataCategory(INTERPRO_DATA) + "/" + + getDataName(INTERPRO_DATA)); if (interProFiles.size() != 1) { - throw new CellBaseException("Only one " + getDataName(INTERPRO_DATA) + " file is expected, but currently there are " - + interProFiles.size() + " files"); + throw new CellBaseException(getMismatchNumFilesErrorMessage(getDataName(INTERPRO_DATA), interProFiles.size())); } // Prepare UniProt data by splitting data in chunks Path uniProtChunksPath = serializer.getOutdir().resolve(UNIPROT_CHUNKS_SUBDIRECTORY); logger.info("Split {} file {} into chunks at {}", getDataName(UNIPROT_DATA), uniProtFiles.get(0).getName(), uniProtChunksPath); Files.createDirectories(uniProtChunksPath); - splitUniprot(proteinPath.resolve(uniProtFiles.get(0).getName()), uniProtChunksPath); + splitUniprot(proteinPath.resolve(UNIPROT_DATA).resolve(uniProtFiles.get(0).getName()), uniProtChunksPath); // Prepare RocksDB RocksDB rocksDb = getDBConnection(uniProtChunksPath); @@ -99,6 +97,7 @@ public void parse() throws CellBaseException, IOException { try { File[] files = uniProtChunksPath.toFile().listFiles((dir, name) -> name.endsWith(".xml") || name.endsWith(".xml.gz")); + for (File file : files) { logger.info(PARSING_LOG_MESSAGE, file); Uniprot uniprot = (Uniprot) UniProtParser.loadXMLInfo(file.toString(), UniProtParser.UNIPROT_CONTEXT); @@ -108,18 +107,24 
@@ public void parse() throws CellBaseException, IOException { for (OrganismNameType organismNameType : entry.getOrganism().getName()) { entryOrganism = organismNameType.getValue(); if (entryOrganism.equals(species)) { + proteinMap.put(entry.getAccession().get(0), entry); + + // Update RocksDB rocksDb.put(entry.getAccession().get(0).getBytes(), jsonObjectWriter.writeValueAsBytes(entry)); } } } - logger.info(PARSING_DONE_LOG_MESSAGE, file); + logger.info(PARSING_DONE_LOG_MESSAGE); + } + logger.info("Number of proteins stored in map: '{}'", proteinMap.size()); + if (proteinMap.size() > 10) { + logger.info("First 10 protein IDs in map: {}", proteinMap.keySet().stream().collect(Collectors.toList()).subList(0, 10)); } - logger.debug("Number of proteins stored in map: '{}'", proteinMap.size()); logger.info(PARSING_LOG_MESSAGE, interProFiles.get(0)); try (BufferedReader interproBuffereReader = FileUtils.newBufferedReader(interProFiles.get(0).toPath())) { - Set hashSet = new HashSet<>(proteinMap.keySet()); - Set visited = new HashSet<>(30000); + Set hashSet = proteinMap.keySet(); + Set visited = new HashSet<>(proteinMap.size()); int numInterProLinesProcessed = 0; int numUniqueProteinsProcessed = 0; @@ -141,8 +146,6 @@ public void parse() throws CellBaseException, IOException { && featureType.getLocation().getEnd().getPosition() != null && featureType.getLocation().getBegin().getPosition().equals(start) && featureType.getLocation().getEnd().getPosition().equals(end)) { - featureType.setId(fields[1]); - featureType.setRef(fields[3]); iprAdded = true; break; } @@ -166,24 +169,38 @@ public void parse() throws CellBaseException, IOException { bytes = rocksDb.get(fields[0].getBytes()); entry = mapper.readValue(bytes, Entry.class); entry.getFeature().add(featureType); + + if (fields[0].equalsIgnoreCase(entry.getAccession().get(0))) { + // Update RocksDB + rocksDb.put(fields[0].getBytes(), jsonObjectWriter.writeValueAsBytes(entry)); + } else { + logger.info("Something wrong 
happen: interpro fields[0] = {} vs entry.getAccession().get(0) = {}", + fields[0], entry.getAccession().get(0)); + } } if (!visited.contains(fields[0])) { visited.add(fields[0]); numUniqueProteinsProcessed++; } + } else { + logger.info("{} not found in protein map", fields[0]); } if (++numInterProLinesProcessed % 10000000 == 0) { - logger.debug("{} {} lines processed. {} unique proteins processed", numInterProLinesProcessed, - getDataName(INTERPRO_DATA), numUniqueProteinsProcessed); + logger.info("{} {} lines processed", numInterProLinesProcessed, getDataName(INTERPRO_DATA)); + logger.info("{} {} unique proteins processed", getDataName(INTERPRO_DATA), numUniqueProteinsProcessed); } } - logger.info(PARSING_DONE_LOG_MESSAGE, interProFiles.get(0)); + logger.info("{} {} lines processed", numInterProLinesProcessed, getDataName(INTERPRO_DATA)); + logger.info("{} {} unique proteins processed", getDataName(INTERPRO_DATA), numUniqueProteinsProcessed); + + logger.info(PARSING_DONE_LOG_MESSAGE); } catch (IOException e) { throw new CellBaseException("Error parsing " + getDataName(INTERPRO_DATA) + " file: " + interProFiles.get(0), e); } + // Serialize and save results RocksIterator rocksIterator = rocksDb.newIterator(); for (rocksIterator.seekToFirst(); rocksIterator.isValid(); rocksIterator.next()) { @@ -258,4 +275,8 @@ private void splitUniprot(Path uniprotFilePath, Path splitOutdirPath) throws IOE } } } + + private String getMismatchNumFilesErrorMessage(String dataName, int numFiles) { + return "Only one " + dataName + " file is expected, but currently there are " + numFiles + " files"; + } } From 7f77dec9a23889438780f4ac2936a91831479b2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 29 Jul 2024 17:44:59 +0200 Subject: [PATCH 114/148] lib: fix gene downloader for RefSeq files, #TASK-5575, #TASK-5564 --- .../org/opencb/cellbase/lib/download/GeneDownloadManager.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index c091ed11e5..5af9f01097 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -152,7 +152,7 @@ private List downloadRefSeq(Path refSeqDownloadPath) throws IOExce if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), GENE_DATA)) { // GTF, DNA, RNA, Protein String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); - if (configuration.getDownload().getGenomicSuperDups().getFiles().containsKey(prefixId + REFSEQ_GENOMIC_GTF_FILE_ID) + if (configuration.getDownload().getRefSeq().getFiles().containsKey(prefixId + REFSEQ_GENOMIC_GTF_FILE_ID) && !isAlreadyDownloaded(refSeqDownloadPath.resolve(getDataVersionFilename(REFSEQ_DATA)), getDataName(REFSEQ_DATA))) { logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataName(REFSEQ_DATA), getDataCategory(REFSEQ_DATA)); From 0eb898e776206bbe075569c41220ca6ea49b1351 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 31 Jul 2024 09:59:49 +0200 Subject: [PATCH 115/148] lib: improve gene (Ensembl/RefSeq) builder by supporting multi-species (e.g., mmusculus), #TASK-6426, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 76 ++++-- .../org/opencb/cellbase/lib/EtlCommons.java | 34 ++- .../lib/builders/AbstractBuilder.java | 7 +- .../lib/builders/EnsemblGeneBuilder.java | 153 +++++++----- .../builders/EnsemblGeneBuilderIndexer.java | 26 ++- .../cellbase/lib/builders/GeneBuilder.java | 66 +++++- .../lib/builders/GeneBuilderIndexer.java | 81 ++++--- .../lib/builders/RefSeqGeneBuilder.java | 217 +++++++++--------- .../builders/RefSeqGeneBuilderIndexer.java | 6 +- 9 files changed, 414 insertions(+), 252 
deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 129b31e78d..aff9c5cc89 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -47,8 +47,10 @@ import static org.opencb.cellbase.lib.EtlCommons.*; import static org.opencb.cellbase.lib.builders.AbstractBuilder.BUILDING_DONE_LOG_MESSAGE; import static org.opencb.cellbase.lib.builders.AbstractBuilder.BUILDING_LOG_MESSAGE; +import static org.opencb.cellbase.lib.builders.EnsemblGeneBuilder.ENSEMBL_GENE_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.builders.GenomeSequenceFastaBuilder.GENOME_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.builders.ProteinBuilder.OUTPUT_PROTEIN_OUTPUT_FILENAME; +import static org.opencb.cellbase.lib.builders.RefSeqGeneBuilder.REFSEQ_GENE_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.builders.RegulatoryFeatureBuilder.*; import static org.opencb.cellbase.lib.builders.RepeatsBuilder.REPEATS_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.download.GenomeDownloadManager.GENOME_INFO_FILENAME; @@ -69,8 +71,6 @@ public class BuildCommandExecutor extends CommandExecutor { private boolean flexibleGTFParsing; - private static final String DATA_ALREADY_BUILT = "{} data has already been built."; - public BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildCommandOptions) { super(buildCommandOptions.commonOptions.logLevel, buildCommandOptions.commonOptions.conf); @@ -234,8 +234,49 @@ private AbstractBuilder buildGenomeSequence() throws CellBaseException { } private AbstractBuilder buildGene() throws CellBaseException { - return new GeneBuilder(downloadFolder.resolve(GENE_DATA), buildFolder.resolve(GENE_DATA), 
speciesConfiguration, flexibleGTFParsing, - configuration); + logger.info(BUILDING_LOG_MESSAGE, getDataName(GENE_DATA)); + + // Sanity check + Path geneDownloadPath = downloadFolder.resolve(GENE_DATA); + Path geneBuildPath = buildFolder.resolve(GENE_DATA); + + List versionFiles = new ArrayList<>(Arrays.asList( + geneDownloadPath.resolve(ENSEMBL_DATA).resolve(getDataVersionFilename(ENSEMBL_DATA)), + geneDownloadPath.resolve(REFSEQ_DATA).resolve(getDataVersionFilename(REFSEQ_DATA)))); + List dataList = GeneBuilder.getCommonDataSources(speciesConfiguration, configuration); + for (String data : dataList) { + Path versionFile; + switch (data) { + case MIRTARBASE_DATA: + versionFile = downloadFolder.resolve(REGULATION_DATA).resolve(MIRTARBASE_DATA).resolve(getDataVersionFilename(data)); + break; + case MIRBASE_DATA: + versionFile = downloadFolder.resolve(REGULATION_DATA).resolve(MIRBASE_DATA).resolve(getDataVersionFilename(data)); + break; + default: + versionFile = downloadFolder.resolve(GERP_DATA).resolve(getDataVersionFilename(data)); + break; + } + versionFiles.add(versionFile); + } + + List filesToCheck = new ArrayList<>(Arrays.asList(geneBuildPath.resolve(ENSEMBL_GENE_OUTPUT_FILENAME), + geneBuildPath.resolve(REFSEQ_GENE_OUTPUT_FILENAME))); + for (Path versionFile : versionFiles) { + filesToCheck.add(geneBuildPath.resolve(versionFile.getFileName())); + } + filesToCheck.addAll(versionFiles); + + if (AbstractBuilder.existFiles(filesToCheck)) { + logger.warn(DATA_ALREADY_BUILT, getDataName(ENSEMBL_DATA) + " and " + getDataName(REFSEQ_DATA) + " genes"); + return null; + } + + System.exit(-1); + + copyVersionFiles(versionFiles, geneBuildPath); + + return new GeneBuilder(geneDownloadPath, geneBuildPath, speciesConfiguration, flexibleGTFParsing, configuration); } private AbstractBuilder buildRepeats() throws CellBaseException { @@ -403,25 +444,8 @@ private Path getFastaReferenceGenome() throws CellBaseException { SpeciesUtils.getSpeciesShortname(speciesConfiguration), 
assembly.getName(), null); String fastaFilename = Paths.get(ensemblUrl).getFileName().toString(); Path gzFastaPath = downloadFolder.resolve(GENOME_DATA).resolve(fastaFilename); - Path fastaPath = downloadFolder.resolve(GENOME_DATA).resolve(fastaFilename.replace(GZ_EXTENSION, "")); - if (!fastaPath.toFile().exists()) { - // Gunzip - logger.info("Gunzip file: {}", gzFastaPath); - try { - List params = Arrays.asList("--keep", gzFastaPath.toString()); - EtlCommons.runCommandLineProcess(null, "gunzip", params, null); - } catch (IOException e) { - throw new CellBaseException("Error executing gunzip in FASTA file " + gzFastaPath, e); - } catch (InterruptedException e) { - // Restore interrupted state... - Thread.currentThread().interrupt(); - throw new CellBaseException("Error executing gunzip in FASTA file " + gzFastaPath, e); - } - } - if (!fastaPath.toFile().exists()) { - throw new CellBaseException("FASTA file " + fastaPath + " does not exist after executing gunzip"); - } - return fastaPath; + + return EtlCommons.getFastaPath(gzFastaPath); } private AbstractBuilder buildSplice() throws IOException, CellBaseException { @@ -484,7 +508,11 @@ private void checkVersionFiles(List versionPaths) throws CellBaseException private void copyVersionFiles(List versionPaths, Path targetPath) throws CellBaseException { // Check version files before copying them checkVersionFiles(versionPaths); - if (!targetPath.toFile().exists()) { + copyFiles(versionPaths, targetPath); + } + + private void copyFiles(List versionPaths, Path targetPath) throws CellBaseException { + if (!Files.exists(targetPath)) { try { Files.createDirectories(targetPath); } catch (IOException e) { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 11af71249b..e94acaf4bf 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -93,13 +93,11 @@ public final class EtlCommons { // Gene public static final String GENE_DATA = "gene"; - public static final String ENSEMBL_GENE_BASENAME = "ensemblGene"; public static final String GENE_ANNOTATION_DATA = "gene_annotation"; public static final String GENE_DISEASE_ANNOTATION_DATA = "gene_disease_annotation"; // RefSeq public static final String REFSEQ_DATA = "refseq"; - public static final String REFSEQ_GENE_BASENAME = "refSeqGene"; // Must match the configuration file public static final String REFSEQ_GENOMIC_GTF_FILE_ID = "GENOMIC_GTF"; public static final String REFSEQ_GENOMIC_FNA_FILE_ID = "GENOMIC_FNA"; @@ -508,7 +506,7 @@ public static boolean runCommandLineProcess(File workingDirectory, String binPat ProcessBuilder builder = getProcessBuilder(workingDirectory, binPath, args, logFilePath); - LOGGER.debug("Executing command: {}", StringUtils.join(builder.command(), " ")); + LOGGER.info("Executing command: {}", StringUtils.join(builder.command(), " ")); Process process = builder.start(); process.waitFor(); @@ -541,6 +539,34 @@ private static ProcessBuilder getProcessBuilder(File workingDirectory, String bi return builder; } + public static Path getFastaPath(Path gzFastaPath) throws CellBaseException { + // Sanity check + if (!Files.exists(gzFastaPath)) { + throw new CellBaseException("Gzipped FASTA file " + gzFastaPath + " does not exist"); + } + + // Check FASTA and unzip if necessary + Path fastaPath = gzFastaPath.getParent().resolve(gzFastaPath.getFileName().toString().replace(GZ_EXTENSION, "")); + if (!fastaPath.toFile().exists()) { + // Gunzip + LOGGER.info("Gunzip file {}", gzFastaPath); + try { + List params = Arrays.asList("--keep", gzFastaPath.toString()); + EtlCommons.runCommandLineProcess(null, "gunzip", params, null); + } catch (IOException e) { + throw new CellBaseException("Error executing gunzip in FASTA file " + gzFastaPath, e); + } catch 
(InterruptedException e) { + // Restore interrupted state... + Thread.currentThread().interrupt(); + throw new CellBaseException("Error executing gunzip in FASTA file " + gzFastaPath, e); + } + } + if (!fastaPath.toFile().exists()) { + throw new CellBaseException("FASTA file " + fastaPath + " does not exist after executing gunzip"); + } + return fastaPath; + } + public static boolean isMissing(String string) { return !((string != null) && !string.isEmpty() && !string.replace(" ", "") @@ -736,7 +762,7 @@ private static List getRepeatsDataList(CellBaseConfiguration configurati return dataList; } - private static boolean isDataSupported(DownloadProperties.URLProperties props, String prefix) { + public static boolean isDataSupported(DownloadProperties.URLProperties props, String prefix) { for (String key : props.getFiles().keySet()) { if (key.startsWith(prefix)) { return true; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AbstractBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AbstractBuilder.java index 8359f26e8d..550197c762 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AbstractBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AbstractBuilder.java @@ -61,6 +61,9 @@ public abstract class AbstractBuilder { public static final String PARSING_LOG_MESSAGE = "Parsing {} ..."; public static final String PARSING_DONE_LOG_MESSAGE = "Parsing done."; + public static final String SKIPPING_INDEX_DATA_LOG_MESSAGE = "Skipping index for data '{}': it is not supported for species '{}'."; + public static final String DATA_ALREADY_BUILT = "'{}' data has already been built."; + protected AbstractBuilder(CellBaseSerializer serializer) { logger = LoggerFactory.getLogger(this.getClass()); @@ -80,7 +83,7 @@ public void disconnect() { } } - protected String getConfigurationFileIdPrefix(String scientificSpecies) { + protected static String getConfigurationFileIdPrefix(String 
scientificSpecies) { String prefix = ""; if (StringUtils.isNotEmpty(scientificSpecies) && !scientificSpecies.equals("Homo sapiens") && scientificSpecies.contains(" ")) { char c = scientificSpecies.charAt(0); @@ -94,6 +97,8 @@ protected File checkFile(DownloadProperties.URLProperties props, String fileId, String filename = Paths.get(props.getFiles().get(fileId)).getFileName().toString(); if (filename.contains(MANUAL_PREFIX)) { filename = filename.replace(MANUAL_PREFIX, ""); + } else if (filename.contains(SCRIPT_PREFIX)) { + filename = filename.split("@")[1]; } Path filePath = targetPath.resolve(filename); if (!Files.exists(filePath)) { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java index e29cba82b1..044d9bc232 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java @@ -31,7 +31,6 @@ import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.opencb.cellbase.lib.EtlCommons; import org.rocksdb.RocksDBException; import java.io.File; @@ -53,31 +52,28 @@ public class EnsemblGeneBuilder extends AbstractBuilder { private final Map transcriptDict; private final Map exonDict; - private Path gtfFile; - private Path proteinFastaFile; - private Path cDnaFastaFile; - private Path geneDescriptionFile; - private Path xrefsFile; - private Path hgncFile; - private Path maneFile; - private Path lrgFile; - private Path uniprotIdMappingFile; - private Path tfbsFile; - private Path tabixFile; - private Path geneExpressionFile; - private Path geneDrugFile; - private Path hpoFile; - private Path disgenetFile; - private Path genomeSequenceFilePath; - private Path gnomadFile; - private Path 
geneOntologyAnnotationFile; - private Path miRBaseFile; - private Path miRTarBaseFile; - private Path cancerGeneCensusFile; - private Path cancerHostpotFile; - private Path ensemblCanonicalFile; -// private Path tso500File; -// private Path eglhHaemOncFile; + private Path gtfFile = null; + private Path proteinFastaFile = null; + private Path cDnaFastaFile = null; + private Path geneDescriptionFile = null; + private Path xrefsFile = null; + private Path hgncFile = null; + private Path maneFile = null; + private Path lrgFile = null; + private Path uniprotIdMappingFile = null; + private Path tfbsFile = null; + private Path tabixFile = null; + private Path geneExpressionFile = null; + private Path geneDrugFile = null; + private Path hpoFile = null; + private Path genomeSequenceFilePath = null; + private Path gnomadFile = null; + private Path geneOntologyAnnotationFile = null; + private Path miRBaseFile = null; + private Path miRTarBaseFile = null; + private Path cancerGeneCensusFile = null; + private Path cancerHostpotFile = null; + private Path ensemblCanonicalFile = null; // source for genes is either ensembl or refseq private final String SOURCE = ParamConstants.QueryParams.ENSEMBL.key(); @@ -92,6 +88,11 @@ public class EnsemblGeneBuilder extends AbstractBuilder { private String feature; private Gtf nextGtfToReturn; + private boolean isHSapiens = false; + + public static final String ENSEMBL_GENE_BASENAME = "ensemblGene"; + public static final String ENSEMBL_GENE_OUTPUT_FILENAME = ENSEMBL_GENE_BASENAME + ".json.gz"; + public EnsemblGeneBuilder(Path downloadPath, SpeciesConfiguration speciesConfiguration, boolean flexibleGTFParsing, CellBaseConfiguration configuration, CellBaseSerializer serializer) { super(serializer); @@ -103,6 +104,10 @@ public EnsemblGeneBuilder(Path downloadPath, SpeciesConfiguration speciesConfigu transcriptDict = new HashMap<>(250000); exonDict = new HashMap<>(8000000); + + if 
(speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + isHSapiens = true; + } } public void check() throws Exception { @@ -134,27 +139,68 @@ public void check() throws Exception { xrefsFile = checkFile(props, ENSEMBL_XREFS_FILE_ID, downloadPath.getParent(), "Ensembl Xrefs").toPath(); ensemblCanonicalFile = checkFile(props, ENSEMBL_CANONICAL_FILE_ID, downloadPath.getParent(), "Ensembl Canonical").toPath(); - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { -// tso500File = checkFile(props, ENSEMBL_TSO500_FILE_ID, downloadPath.getParent(), "Ensembl TSO 500").toPath(); -// eglhHaemOncFile = checkFile(props, ENSEMBL_HAEM_ONC_TRANSCRIPTS_FILE_ID, downloadPath.getParent(), "EGLH Haem Onc").toPath(); + // Check common files + String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); + if (isHSapiens || isDataSupported(configuration.getDownload().getManeSelect(), prefixId)) { maneFile = checkFiles(MANE_SELECT_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(MANE_SELECT_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getLrg(), prefixId)) { lrgFile = checkFiles(LRG_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(LRG_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getHgnc(), prefixId)) { hgncFile = checkFiles(HGNC_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(HGNC_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getCancerHotspot(), prefixId)) { cancerHostpotFile = checkFiles(CANCER_HOTSPOT_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + 
logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(CANCER_HOTSPOT_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getDgidb(), prefixId)) { geneDrugFile = checkFiles(DGIDB_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(DGIDB_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getGeneUniprotXref(), prefixId)) { uniprotIdMappingFile = checkFiles(UNIPROT_XREF_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(UNIPROT_XREF_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getGeneExpressionAtlas(), prefixId)) { geneExpressionFile = checkFiles(GENE_EXPRESSION_ATLAS_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(UNIPROT_XREF_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getHpo(), prefixId)) { hpoFile = checkFiles(HPO_DISEASE_DATA, downloadPath.getParent(), 1).get(0).toPath(); - disgenetFile = checkFiles(DISGENET_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(HPO_DISEASE_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getGnomadConstraints(), prefixId)) { gnomadFile = checkFiles(GNOMAD_CONSTRAINTS_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(GNOMAD_CONSTRAINTS_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getGoAnnotation(), prefixId)) { geneOntologyAnnotationFile = 
checkFiles(GO_ANNOTATION_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(GO_ANNOTATION_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getCancerHotspot(), prefixId)) { cancerGeneCensusFile = checkFiles(CANCER_GENE_CENSUS_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(CANCER_GENE_CENSUS_DATA), speciesConfiguration.getScientificName()); } // Check regulation files // Motif features - List files = checkFiles(ensemblGeneLabel, MOTIF_FEATURES_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA), - 2); + List files = checkFiles(ensemblGeneLabel, MOTIF_FEATURES_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA) + .resolve(MOTIF_FEATURES_DATA), 2); if (files.get(0).getName().endsWith("tbi")) { tabixFile = files.get(0).toPath(); tfbsFile = files.get(1).toPath(); @@ -162,36 +208,28 @@ public void check() throws Exception { tabixFile = files.get(1).toPath(); tfbsFile = files.get(0).toPath(); } + // mirbase - miRBaseFile = checkFiles(MIRBASE_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA), 1).get(0).toPath(); + if (isHSapiens || isDataSupported(configuration.getDownload().getMirbase(), prefixId)) { + miRBaseFile = checkFiles(MIRBASE_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA) + .resolve(MIRBASE_DATA), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(MIRTARBASE_DATA), speciesConfiguration.getScientificName()); + } // mirtarbase - miRTarBaseFile = checkFiles(MIRTARBASE_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA), 1).get(0).toPath(); + if (isHSapiens || isDataSupported(configuration.getDownload().getMiRTarBase(), prefixId)) { + miRTarBaseFile = checkFiles(MIRTARBASE_DATA, 
downloadPath.getParent().getParent().resolve(REGULATION_DATA) + .resolve(MIRTARBASE_DATA), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(MIRTARBASE_DATA), speciesConfiguration.getScientificName()); + } // Check genome FASTA file Path genomeDownloadPath = downloadPath.getParent().getParent().resolve(GENOME_DATA); String genomeGzFilename = Paths.get(((DataSource) dataSourceReader.readValue(genomeDownloadPath .resolve(getDataVersionFilename(GENOME_DATA)).toFile())).getUrls().get(0)).getFileName().toString(); - genomeSequenceFilePath = genomeDownloadPath.resolve(genomeGzFilename); - if (Files.exists(genomeSequenceFilePath)) { - // Need to be gunzip-ed - logger.info("Gunzip file: {}", genomeSequenceFilePath); - try { - EtlCommons.runCommandLineProcess(null, "gunzip", Collections.singletonList(genomeSequenceFilePath.toString()), null); - } catch (IOException e) { - throw new CellBaseException("Error executing gunzip in FASTA file " + genomeSequenceFilePath, e); - } catch (InterruptedException e) { - // Restore interrupted state... 
- Thread.currentThread().interrupt(); - throw new CellBaseException("Error executing gunzip in FASTA file " + genomeSequenceFilePath, e); - } - } - String genomeFilename = genomeGzFilename.replace(GZ_EXTENSION, ""); - genomeSequenceFilePath = genomeDownloadPath.resolve(genomeFilename); - if (!Files.exists(genomeSequenceFilePath)) { - throw new CellBaseException("Genome FASTA file " + genomeSequenceFilePath.getFileName() + " does not exist at " - + genomeSequenceFilePath.getParent()); - } + genomeSequenceFilePath = getFastaPath(genomeDownloadPath.resolve(genomeGzFilename)); logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE, ensemblGeneLabel); checked = true; @@ -209,10 +247,9 @@ public void parse() throws Exception { EnsemblGeneBuilderIndexer indexer = new EnsemblGeneBuilderIndexer(serializer.getOutdir()); try { // process files and put values in rocksdb - indexer.index(geneDescriptionFile, xrefsFile, hgncFile, maneFile, lrgFile, uniprotIdMappingFile, - proteinFastaFile, cDnaFastaFile, speciesConfiguration.getScientificName(), geneExpressionFile, - geneDrugFile, hpoFile, disgenetFile, gnomadFile, geneOntologyAnnotationFile, miRBaseFile, - miRTarBaseFile, cancerGeneCensusFile, cancerHostpotFile, ensemblCanonicalFile); + indexer.index(geneDescriptionFile, xrefsFile, hgncFile, maneFile, lrgFile, uniprotIdMappingFile, proteinFastaFile, + cDnaFastaFile, speciesConfiguration.getScientificName(), geneExpressionFile, geneDrugFile, hpoFile, gnomadFile, + geneOntologyAnnotationFile, miRBaseFile, miRTarBaseFile, cancerGeneCensusFile, cancerHostpotFile, ensemblCanonicalFile); TabixReader tabixReader = null; if (!Files.exists(tfbsFile) || !Files.exists(tabixFile)) { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java index d46ebef225..4841f5ffe2 100644 --- 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java @@ -71,8 +71,8 @@ public EnsemblGeneBuilderIndexer(Path geneDirectoryPath) { public void index(Path geneDescriptionFile, Path xrefsFile, Path hgncFile, Path maneFile, Path lrgFile, Path uniprotIdMappingFile, Path proteinFastaFile, Path cDnaFastaFile, String species, Path geneExpressionFile, Path geneDrugFile, Path hpoFile, - Path disgenetFile, Path gnomadFile, Path geneOntologyAnnotationFile, Path miRBaseFile, Path miRTarBaseFile, - Path cancerGeneGensusFile, Path cancerHostpotFile, Path canonicalFile) + Path gnomadFile, Path geneOntologyAnnotationFile, Path miRBaseFile, Path miRTarBaseFile, Path cancerGeneGensusFile, + Path cancerHostpotFile, Path canonicalFile) throws IOException, RocksDBException, FileFormatException, CellBaseException { indexDescriptions(geneDescriptionFile); indexXrefs(xrefsFile, uniprotIdMappingFile); @@ -83,7 +83,7 @@ public void index(Path geneDescriptionFile, Path xrefsFile, Path hgncFile, Path indexCdnaSequences(cDnaFastaFile); indexExpression(species, geneExpressionFile); indexDrugs(geneDrugFile); - indexDiseases(hpoFile, disgenetFile); + indexDiseases(hpoFile); indexConstraints(gnomadFile); indexOntologyAnnotations(geneOntologyAnnotationFile); indexMiRBase(species, miRBaseFile); @@ -91,8 +91,6 @@ public void index(Path geneDescriptionFile, Path xrefsFile, Path hgncFile, Path indexCancerGeneCensus(cancerGeneGensusFile); indexCancerHotspot(cancerHostpotFile); indexCanonical(canonicalFile); -// indexTSO500(tso500File); -// indexEGLHHaemOnc(eglhHaemOncFile); } private void indexDescriptions(Path geneDescriptionFile) throws IOException, RocksDBException { @@ -202,6 +200,10 @@ public List getXrefs(String id) throws RocksDBException, IOException { } private void indexExpression(String species, Path geneExpressionFile) throws IOException, RocksDBException { 
+ if (geneExpressionFile == null) { + return; + } + Map> geneExpressionMap = new HashMap<>(); if (geneExpressionFile != null && Files.exists(geneExpressionFile) && Files.size(geneExpressionFile) > 0 && species != null) { @@ -253,7 +255,11 @@ public List getExpression(String id) throws RocksDBException, IOExce } private void indexConstraints(Path gnomadFile) throws IOException, RocksDBException { - if (gnomadFile != null && Files.exists(gnomadFile) && Files.size(gnomadFile) > 0) { + if (gnomadFile == null) { + return; + } + + if (Files.exists(gnomadFile) && Files.size(gnomadFile) > 0) { logger.info("Loading OE scores from '{}'", gnomadFile); InputStream inputStream = Files.newInputStream(gnomadFile); BufferedReader br = new BufferedReader(new InputStreamReader(new GZIPInputStream(inputStream))); @@ -309,6 +315,10 @@ private void addConstraint(List constraints, String name, String val } private void indexOntologyAnnotations(Path goaFile) throws IOException, RocksDBException { + if (goaFile == null) { + return; + } + Map> annotations = new HashMap<>(); if (goaFile != null && Files.exists(goaFile) && Files.size(goaFile) > 0) { logger.info("Loading GO annotation from '{}'", goaFile); @@ -329,6 +339,10 @@ public List getOntologyAnnotations(String id) thr } private void indexMiRBase(String species, Path miRBaseFile) throws IOException { + if (miRBaseFile == null) { + return; + } + logger.info(PARSING_LOG_MESSAGE, miRBaseFile); MirBaseCallback callback = new MirBaseCallback(rocksdb, rocksDbManager); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java index a5dda27e34..44b7e587fc 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java @@ -21,12 +21,19 @@ import org.opencb.cellbase.core.exception.CellBaseException; import 
org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; +import java.nio.file.Files; import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; import static org.opencb.cellbase.lib.EtlCommons.*; +import static org.opencb.cellbase.lib.builders.EnsemblGeneBuilder.ENSEMBL_GENE_BASENAME; +import static org.opencb.cellbase.lib.builders.RefSeqGeneBuilder.REFSEQ_GENE_BASENAME; +import static org.opencb.cellbase.lib.builders.RefSeqGeneBuilder.REFSEQ_GENE_OUTPUT_FILENAME; public class GeneBuilder extends AbstractBuilder { + private Path downloadPath; private EnsemblGeneBuilder ensemblGeneBuilder; private RefSeqGeneBuilder refSeqGeneBuilder; @@ -34,15 +41,15 @@ public GeneBuilder(Path downloadPath, Path buildPath, SpeciesConfiguration speci CellBaseConfiguration configuration) throws CellBaseException { super(null); + this.downloadPath = downloadPath; + // Create Ensembl gene builder - CellBaseJsonFileSerializer ensemblGeneSerializer = new CellBaseJsonFileSerializer(buildPath.resolve(ENSEMBL_DATA), - ENSEMBL_GENE_BASENAME); + CellBaseJsonFileSerializer ensemblGeneSerializer = new CellBaseJsonFileSerializer(buildPath, ENSEMBL_GENE_BASENAME); this.ensemblGeneBuilder = new EnsemblGeneBuilder(downloadPath.resolve(ENSEMBL_DATA), speciesConfiguration, flexibleGTFParsing, configuration, ensemblGeneSerializer); // Create RefSeq gene builder - CellBaseJsonFileSerializer refSeqGeneSerializer = new CellBaseJsonFileSerializer(buildPath.resolve(REFSEQ_DATA), - REFSEQ_GENE_BASENAME); + CellBaseJsonFileSerializer refSeqGeneSerializer = new CellBaseJsonFileSerializer(buildPath, REFSEQ_GENE_BASENAME); this.refSeqGeneBuilder = new RefSeqGeneBuilder(downloadPath.resolve(REFSEQ_DATA), speciesConfiguration, configuration, refSeqGeneSerializer); } @@ -57,15 +64,58 @@ public void check() throws Exception { @Override public void parse() throws Exception { - logger.info(BUILDING_LOG_MESSAGE, getDataName(GENE_DATA)); - // Check folders and files before building check(); 
-// // Build Ensembl/RefSeq genes + // Build Ensembl genes ensemblGeneBuilder.parse(); - refSeqGeneBuilder.parse(); + + // Build RefSeq genes + if (!Files.exists(downloadPath.resolve(REFSEQ_DATA).resolve(REFSEQ_GENE_OUTPUT_FILENAME))) { + refSeqGeneBuilder.parse(); + } else { + logger.info(DATA_ALREADY_BUILT, getDataName(REFSEQ_DATA) + " gene"); + } + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(GENE_DATA)); } + + public static List getCommonDataSources(SpeciesConfiguration speciesConfiguration, CellBaseConfiguration configuration) { + List dataList = new ArrayList<>(); + + boolean isHSapiens = false; + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + isHSapiens = true; + } + + String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); + + if (isHSapiens || isDataSupported(configuration.getDownload().getManeSelect(), prefixId)) { + dataList.add(MANE_SELECT_DATA); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getLrg(), prefixId)) { + dataList.add(LRG_DATA); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getCancerHotspot(), prefixId)) { + dataList.add(CANCER_HOTSPOT_DATA); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getDgidb(), prefixId)) { + dataList.add(DGIDB_DATA); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getHpo(), prefixId)) { + dataList.add(HPO_DISEASE_DATA); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getCancerHotspot(), prefixId)) { + dataList.add(CANCER_GENE_CENSUS_DATA); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getMiRTarBase(), prefixId)) { + dataList.add(MIRTARBASE_DATA); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getMirbase(), prefixId)) { + dataList.add(MIRBASE_DATA); + } + + return dataList; + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java index 1f56afe564..7b980ffd72 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java @@ -44,8 +44,8 @@ import java.util.*; import java.util.stream.Collectors; -import static org.opencb.cellbase.lib.EtlCommons.DISGENET_DATA; import static org.opencb.cellbase.lib.EtlCommons.ENSEMBL_DATA; +import static org.opencb.cellbase.lib.EtlCommons.HPO_DISEASE_DATA; import static org.opencb.cellbase.lib.builders.AbstractBuilder.PARSING_DONE_LOG_MESSAGE; import static org.opencb.cellbase.lib.builders.AbstractBuilder.PARSING_LOG_MESSAGE; @@ -69,8 +69,6 @@ public class GeneBuilderIndexer { protected final String DRUGS_SUFFIX = "_drug"; protected final String DISEASE_SUFFIX = "_disease"; protected final String MIRTARBASE_SUFFIX = "_mirtarbase"; -// protected final String TSO500_SUFFIX = "_tso500"; -// protected final String EGLH_HAEMONC_SUFFIX = "_eglh_haemonc"; public GeneBuilderIndexer(Path genePath) { this.init(genePath); @@ -101,6 +99,10 @@ public String getCdnaFasta(String id) throws RocksDBException { } protected void indexProteinSequences(Path proteinFastaFile) throws IOException, FileFormatException, RocksDBException { + if (proteinFastaFile == null) { + return; + } + logger.info(PARSING_LOG_MESSAGE, proteinFastaFile); FastaReader fastaReader = new FastaReader(proteinFastaFile); Fasta fasta; @@ -116,6 +118,10 @@ protected String getProteinFasta(String id) throws RocksDBException { } protected void indexHgncIdMapping(Path hgncMappingFile) throws IOException, RocksDBException { + if (hgncMappingFile == null) { + return; + } + logger.info(PARSING_LOG_MESSAGE, hgncMappingFile); try (BufferedReader bufferedReader = FileUtils.newBufferedReader(hgncMappingFile)) { String line = bufferedReader.readLine(); @@ -135,6 +141,10 @@ public String getHgncId(String id) throws 
RocksDBException { } protected void indexManeMapping(Path maneMappingFile, String referenceId) throws IOException, RocksDBException { + if (maneMappingFile == null) { + return; + } + logger.info(PARSING_LOG_MESSAGE, maneMappingFile); int idColumn = referenceId.equalsIgnoreCase(ENSEMBL_DATA) ? 7 : 5; @@ -161,6 +171,10 @@ public String getMane(String id, String field) throws RocksDBException { } protected void indexLrgMapping(Path lrgMappingFile, String referenceId) throws IOException, RocksDBException { + if (lrgMappingFile == null) { + return; + } + logger.info(PARSING_LOG_MESSAGE, lrgMappingFile); // # Last modified: 30-03-2021@22:00:06 @@ -189,6 +203,10 @@ public String getLrg(String id, String field) throws RocksDBException { } protected void indexCancerGeneCensus(Path cgcFile) throws IOException, RocksDBException { + if (cgcFile == null) { + return; + } + logger.info(PARSING_LOG_MESSAGE, cgcFile); Map tissuesMap = new HashMap<>(); @@ -313,6 +331,10 @@ public List getCancerGeneCensus(String geneName) throws R } public void indexCancerHotspot(Path cancerHotspot) throws IOException, RocksDBException { + if (cancerHotspot == null) { + return; + } + logger.info(PARSING_LOG_MESSAGE, cancerHotspot); // Store all cancer hotspot (different gene and aminoacid position) for each gene in the same key @@ -497,6 +519,10 @@ protected void close() throws IOException { } protected void indexDrugs(Path geneDrugFile) throws IOException, RocksDBException { + if (geneDrugFile == null) { + return; + } + logger.info(PARSING_LOG_MESSAGE, geneDrugFile); String currentGene = ""; @@ -561,49 +587,32 @@ protected void indexDrugs(Path geneDrugFile) throws IOException, RocksDBExceptio logger.info(PARSING_DONE_LOG_MESSAGE, geneDrugFile); } - protected void indexDiseases(Path hpoFilePath, Path disgenetFilePath) throws IOException, RocksDBException { + protected void indexDiseases(Path hpoFilePath) throws IOException, RocksDBException { + if (hpoFilePath == null) { + return; + } Map> 
geneDiseaseAssociationMap = new HashMap<>(50000); String line; // HPO -// logger.info(PARSING_LOG_MESSAGE, hpoFilePath); -// try (BufferedReader bufferedReader = FileUtils.newBufferedReader(hpoFilePath)) { -// // Skip first header line -// bufferedReader.readLine(); -// while ((line = bufferedReader.readLine()) != null) { -// String[] fields = line.split("\t"); -// String omimId = fields[6]; -// String geneSymbol = fields[3]; -// String hpoId = fields[0]; -// String diseaseName = fields[1]; -// GeneTraitAssociation disease = -// new GeneTraitAssociation(omimId, diseaseName, hpoId, 0f, 0, new ArrayList<>(), new ArrayList<>(), HPO_DATA); -// addValueToMapElement(geneDiseaseAssociationMap, geneSymbol, disease); -// } -// } -// logger.info(PARSING_DONE_LOG_MESSAGE, hpoFilePath); - - // DisGeNet - logger.info(PARSING_LOG_MESSAGE, disgenetFilePath); - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(disgenetFilePath)) { + logger.info(PARSING_LOG_MESSAGE, hpoFilePath); + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(hpoFilePath)) { // Skip first header line bufferedReader.readLine(); while ((line = bufferedReader.readLine()) != null) { String[] fields = line.split("\t"); - String diseaseId = fields[4]; - String diseaseName = fields[5]; - String score = fields[9]; - String numberOfPubmeds = fields[13].trim(); - String numberOfSNPs = fields[14]; - String source = fields[15]; - GeneTraitAssociation disease = new GeneTraitAssociation(diseaseId, diseaseName, "", Float.parseFloat(score), - Integer.parseInt(numberOfPubmeds), Arrays.asList(numberOfSNPs), Arrays.asList(source), DISGENET_DATA); - addValueToMapElement(geneDiseaseAssociationMap, fields[1], disease); + String omimId = fields[6]; + String geneSymbol = fields[3]; + String hpoId = fields[0]; + String diseaseName = fields[1]; + GeneTraitAssociation disease = + new GeneTraitAssociation(omimId, diseaseName, hpoId, 0f, 0, new ArrayList<>(), new ArrayList<>(), HPO_DISEASE_DATA); + 
addValueToMapElement(geneDiseaseAssociationMap, geneSymbol, disease); } } - logger.info(PARSING_DONE_LOG_MESSAGE, disgenetFilePath); + logger.info(PARSING_DONE_LOG_MESSAGE); for (Map.Entry> entry : geneDiseaseAssociationMap.entrySet()) { rocksDbManager.update(rocksdb, entry.getKey() + DISEASE_SUFFIX, entry.getValue()); @@ -611,6 +620,10 @@ protected void indexDiseases(Path hpoFilePath, Path disgenetFilePath) throws IOE } protected void indexMiRTarBase(Path miRTarBaseFile) throws IOException, RocksDBException { + if (miRTarBaseFile == null) { + return; + } + MiRTarBaseIndexer miRTarBaseIndexer = new MiRTarBaseIndexer(); Map> result = miRTarBaseIndexer.index(miRTarBaseFile); for (Map.Entry> entry : result.entrySet()) { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java index 3248e2f5d4..b470b2cb21 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java @@ -45,20 +45,17 @@ public class RefSeqGeneBuilder extends AbstractBuilder { private Map transcriptDict; private Map exonDict; - private Path gtfFile; - private Path fastaFile; - private Path proteinFastaFile; - private Path cdnaFastaFile; - private Path maneFile; - private Path lrgFile; - private Path disgenetFile; - private Path hpoFile; - private Path geneDrugFile; - private Path miRTarBaseFile; - private Path cancerGeneCensusFile; - private Path cancerHotspot; -// private Path tso500File; -// private Path eglhHaemOncFile; + private Path gtfFile = null; + private Path fastaFile = null; + private Path proteinFastaFile = null; + private Path cdnaFastaFile = null; + private Path maneFile = null; + private Path lrgFile = null; + private Path hpoFile = null; + private Path geneDrugFile = null; + private Path miRTarBaseFile = null; + private Path cancerGeneCensusFile 
= null; + private Path cancerHotspot = null; private SpeciesConfiguration speciesConfiguration; private static final Map REFSEQ_CHROMOSOMES = new HashMap<>(); private static final String KNOWN_STATUS = "KNOWN"; @@ -70,6 +67,15 @@ public class RefSeqGeneBuilder extends AbstractBuilder { // sometimes there are two stop codons (eg NM_018159.4). Only parse the first one, skip the second private boolean seenStopCodon = false; + private boolean isHSapiens = false; + + private static final String ENSEMBL = "ensembl"; + private static final String TRANSCRIPT_ID = "transcript_id"; + private static final String EXON_NUMBER = "exon_number"; + + public static final String REFSEQ_GENE_BASENAME = "refSeqGene"; + public static final String REFSEQ_GENE_OUTPUT_FILENAME = REFSEQ_GENE_BASENAME + ".json.gz"; + public RefSeqGeneBuilder(Path downloadPath, SpeciesConfiguration speciesConfiguration, CellBaseConfiguration configuration, CellBaseSerializer serializer) { super(serializer); @@ -80,6 +86,10 @@ public RefSeqGeneBuilder(Path downloadPath, SpeciesConfiguration speciesConfigur transcriptDict = new HashMap<>(250000); exonDict = new HashMap<>(8000000); + + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + isHSapiens = true; + } } public void check() throws Exception { @@ -109,43 +119,49 @@ public void check() throws Exception { // Check genome FASTA file String genomeGzFilename = Paths.get(props.getFiles().get(prefixId + REFSEQ_GENOMIC_FNA_FILE_ID)).getFileName().toString(); - fastaFile = downloadPath.resolve(genomeGzFilename); - if (Files.exists(fastaFile)) { - // Need to be gunzip-ed - logger.info("Gunzip file: {}", fastaFile); - try { - EtlCommons.runCommandLineProcess(null, "gunzip", Collections.singletonList(fastaFile.toString()), null); - } catch (IOException e) { - throw new CellBaseException("Error executing gunzip in FASTA file " + fastaFile, e); - } catch (InterruptedException e) { - // Restore interrupted state... 
- Thread.currentThread().interrupt(); - throw new CellBaseException("Error executing gunzip in FASTA file " + fastaFile, e); - } - } - String genomeFilename = genomeGzFilename.replace(GZ_EXTENSION, ""); - fastaFile = downloadPath.resolve(genomeFilename); - if (!Files.exists(fastaFile)) { - throw new CellBaseException("Genome FASTA file " + fastaFile.getFileName() + " does not exist at " + fastaFile.getParent()); - } + Path fastaGzFile = downloadPath.resolve(genomeGzFilename); + fastaFile = EtlCommons.getFastaPath(fastaGzFile); // Check common files - props = configuration.getDownload().getEnsembl().getUrl(); - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { -// tso500File = checkFile(props, ENSEMBL_TSO500_FILE_ID, downloadPath.getParent(), "Ensembl TSO 500").toPath(); -// eglhHaemOncFile = checkFile(props, ENSEMBL_HAEM_ONC_TRANSCRIPTS_FILE_ID, downloadPath.getParent(), "EGLH Haem Onc").toPath(); + if (isHSapiens || isDataSupported(configuration.getDownload().getManeSelect(), prefixId)) { maneFile = checkFiles(MANE_SELECT_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, MANE_SELECT_DATA, speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getLrg(), prefixId)) { lrgFile = checkFiles(LRG_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, LRG_DATA, speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getCancerHotspot(), prefixId)) { cancerHotspot = checkFiles(CANCER_HOTSPOT_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, CANCER_HOTSPOT_DATA, speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getDgidb(), prefixId)) { geneDrugFile = checkFiles(DGIDB_DATA, downloadPath.getParent(), 
1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, DGIDB_DATA, speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getHpo(), prefixId)) { hpoFile = checkFiles(HPO_DISEASE_DATA, downloadPath.getParent(), 1).get(0).toPath(); - disgenetFile = checkFiles(DISGENET_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, HPO_DISEASE_DATA, speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getCancerHotspot(), prefixId)) { cancerGeneCensusFile = checkFiles(CANCER_GENE_CENSUS_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, CANCER_GENE_CENSUS_DATA, speciesConfiguration.getScientificName()); } // Check regulation files // mirtarbase - miRTarBaseFile = checkFiles(MIRTARBASE_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA), 1).get(0).toPath(); + if (isHSapiens || isDataSupported(configuration.getDownload().getMiRTarBase(), prefixId)) { + miRTarBaseFile = checkFiles(MIRTARBASE_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA) + .resolve(MIRTARBASE_DATA), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, MIRTARBASE_DATA, speciesConfiguration.getScientificName()); + } logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE, refSeqGeneLabel); checked = true; @@ -163,41 +179,41 @@ public void parse() throws Exception { // Index protein sequences for later logger.info("Indexing gene annotation for {} ...", getDataName(REFSEQ_DATA)); RefSeqGeneBuilderIndexer indexer = new RefSeqGeneBuilderIndexer(gtfFile.getParent()); - indexer.index(maneFile, lrgFile, proteinFastaFile, cdnaFastaFile, geneDrugFile, hpoFile, disgenetFile, miRTarBaseFile, - cancerGeneCensusFile, cancerHotspot); + indexer.index(maneFile, lrgFile, proteinFastaFile, cdnaFastaFile, geneDrugFile, hpoFile, 
miRTarBaseFile, cancerGeneCensusFile, + cancerHotspot); logger.info("Indexing done for {}", getDataName(REFSEQ_DATA)); logger.info(PARSING_LOG_MESSAGE, gtfFile); - GtfReader gtfReader = new GtfReader(gtfFile); - - Gtf gtf; - while ((gtf = gtfReader.read()) != null) { - String chromosome = getSequenceName(gtf.getSequenceName()); - switch (gtf.getFeature()) { - case "gene": - // we've finished the previous transcript, store xrefs - addXrefs(transcript, geneDbxrefs, exonDbxrefs); - parseGene(gtf, chromosome, indexer); - break; - case "transcript": - break; - case "exon": - parseExon(gtf, chromosome, fastaIndex, indexer); - break; - case "CDS": - parseCDS(gtf, indexer); - break; - case "start_codon": - seenStopCodon = false; - break; - case "stop_codon": - if (!seenStopCodon) { - parseStopCodon(gtf); - seenStopCodon = true; - } - break; - default: - throw new RuntimeException("Unexpected feature type: " + gtf.getFeature()); + try (GtfReader gtfReader = new GtfReader(gtfFile)) { + Gtf gtf; + while ((gtf = gtfReader.read()) != null) { + String chromosome = getSequenceName(gtf.getSequenceName()); + switch (gtf.getFeature()) { + case "gene": + // we've finished the previous transcript, store xrefs + addXrefs(transcript, geneDbxrefs, exonDbxrefs); + parseGene(gtf, chromosome, indexer); + break; + case "transcript": + break; + case "exon": + parseExon(gtf, chromosome, fastaIndex, indexer); + break; + case "CDS": + parseCDS(gtf, indexer); + break; + case "start_codon": + seenStopCodon = false; + break; + case "stop_codon": + if (!seenStopCodon) { + parseStopCodon(gtf); + seenStopCodon = true; + } + break; + default: + throw new CellBaseException("Error parsing: unexpected feature type: " + gtf.getFeature()); + } } } @@ -208,7 +224,6 @@ public void parse() throws Exception { store(); // Close - gtfReader.close(); serializer.close(); if (fastaIndex != null) { fastaIndex.close(); @@ -239,7 +254,6 @@ private void addXrefs(Transcript transcript, Set geneDbxrefs, Set ex return; } 
exonDbxrefs.addAll(geneDbxrefs); -// transcript.setXrefs(new ArrayList<>(exonDbxrefs)); transcript.getXrefs().addAll(exonDbxrefs); transcript.getXrefs().add(new Xref(transcript.getName(), "hgnc_symbol", "HGNC Symbol")); @@ -278,8 +292,9 @@ private void parseGene(Gtf gtf, String chromosome, RefSeqGeneBuilderIndexer inde geneDbxrefs = parseXrefs(gtf); } - private void parseExon(Gtf gtf, String chromosome, FastaIndex fastaIndex, RefSeqGeneBuilderIndexer indexer) throws RocksDBException { - String transcriptId = gtf.getAttributes().get("transcript_id"); + private void parseExon(Gtf gtf, String chromosome, FastaIndex fastaIndex, RefSeqGeneBuilderIndexer indexer) throws RocksDBException, + CellBaseException { + String transcriptId = gtf.getAttributes().get(TRANSCRIPT_ID); // new transcript if (!transcriptDict.containsKey(transcriptId)) { @@ -303,7 +318,7 @@ private void parseExon(Gtf gtf, String chromosome, FastaIndex fastaIndex, RefSeq if (fastaIndex != null) { exonSequence = fastaIndex.query(gtf.getSequenceName(), gtf.getStart(), gtf.getEnd()); } - String exonNumber = gtf.getAttributes().get("exon_number"); + String exonNumber = gtf.getAttributes().get(EXON_NUMBER); // RefSeq does not provide Exon IDs, we are using transcript ID and exon numbers String exonId = transcriptId + "_" + exonNumber; @@ -325,14 +340,14 @@ private void parseExon(Gtf gtf, String chromosome, FastaIndex fastaIndex, RefSeq } } - private void parseCDS(Gtf gtf, RefSeqGeneBuilderIndexer indexer) throws RocksDBException { - String exonNumber = gtf.getAttributes().get("exon_number"); + private void parseCDS(Gtf gtf, RefSeqGeneBuilderIndexer indexer) throws RocksDBException, CellBaseException { + String exonNumber = gtf.getAttributes().get(EXON_NUMBER); if (StringUtils.isEmpty(exonNumber)) { // this CDS doesn't know which exon it belongs to. 
skip return; } - transcript = transcriptDict.get(gtf.getAttributes().get("transcript_id")); + transcript = transcriptDict.get(gtf.getAttributes().get(TRANSCRIPT_ID)); String exonId = transcript.getId() + "_" + exonNumber; Exon exon = exonDict.get(exonId); @@ -458,12 +473,12 @@ private void parseCDS(Gtf gtf, RefSeqGeneBuilderIndexer indexer) throws RocksDBE } private void parseStopCodon(Gtf gtf) { - String exonNumber = gtf.getAttributes().get("exon_number"); + String exonNumber = gtf.getAttributes().get(EXON_NUMBER); if (StringUtils.isEmpty(exonNumber)) { // some codons don't have an exon number, discard return; } - Transcript transcript = transcriptDict.get(gtf.getAttributes().get("transcript_id")); + transcript = transcriptDict.get(gtf.getAttributes().get(TRANSCRIPT_ID)); String exonId = transcript.getId() + "_" + exonNumber; Exon exon = exonDict.get(exonId); @@ -564,14 +579,14 @@ private void parseStopCodon(Gtf gtf) { } } - private Set parseXrefs(Gtf gtf) { + private Set parseXrefs(Gtf gtf) throws CellBaseException { Set xrefSet = new HashSet<>(); String xrefs = gtf.getAttributes().get("db_xref"); if (StringUtils.isNotEmpty(xrefs)) { for (String xrefString : xrefs.split(",")) { String[] dbxrefParts = xrefString.split(":", 2); if (dbxrefParts.length != 2) { - throw new RuntimeException("Bad xref, expected colon: " + xrefString); + throw new CellBaseException("Error parsing Xrefs: bad xref, expected colon: " + xrefString); } String dbName = dbxrefParts[0].toLowerCase(); String id = dbxrefParts[1]; @@ -580,7 +595,7 @@ private Set parseXrefs(Gtf gtf) { dbName = "hgnc_id"; dbDisplayName = "HGNC ID"; } - if ("ensembl".equalsIgnoreCase(dbName)) { + if (ENSEMBL.equalsIgnoreCase(dbName)) { if (id.startsWith("ENST")) { dbName = "ensembl_transcript"; dbDisplayName = "Ensembl transcript"; @@ -601,7 +616,6 @@ private Transcript getTranscript(Gtf gtf, String chromosome, String transcriptId Map gtfAttributes = gtf.getAttributes(); String name = gene.getName(); -// String 
biotype = gtfAttributes.get("gbkey"); String biotype = gtfAttributes.get("transcript_biotype"); if ("mRNA".equals(biotype)) { biotype = "protein_coding"; @@ -612,7 +626,7 @@ private Transcript getTranscript(Gtf gtf, String chromosome, String transcriptId new ArrayList<>(), new ArrayList<>(), new ArrayList<>(), new HashSet<>(), new TranscriptAnnotation()); // Add MANE Select mappings, with this we can know which Ensembl and Refseq transcripts match according to MANE - for (String suffix: Arrays.asList("ensembl", "ensembl_protein")) { + for (String suffix: Arrays.asList(ENSEMBL, "ensembl_protein")) { String maneRefSeq = indexer.getMane(transcriptId, suffix); if (StringUtils.isNotEmpty(maneRefSeq)) { transcript.getXrefs().add(new Xref(maneRefSeq, "mane_select_" + suffix, @@ -621,7 +635,7 @@ private Transcript getTranscript(Gtf gtf, String chromosome, String transcriptId } // Add LRG mappings, with this we can know which Ensembl and Refseq transcripts match according to LRG - String lrgRefSeq = indexer.getLrg(transcriptId, "ensembl"); + String lrgRefSeq = indexer.getLrg(transcriptId, ENSEMBL); if (StringUtils.isNotEmpty(lrgRefSeq)) { transcript.getXrefs().add(new Xref(lrgRefSeq, "lrg_ensembl", "LRG Ensembl")); } @@ -637,15 +651,6 @@ private Transcript getTranscript(Gtf gtf, String chromosome, String transcriptId if (StringUtils.isNotEmpty(lrg)) { transcript.getFlags().add("LRG"); } - // 3. 
TSO500 and EGLH HaemOnc -// String tso500Flag = indexer.getTSO500(transcriptId.split("\\.")[0]); -// if (StringUtils.isNotEmpty(tso500Flag)) { -// transcript.getFlags().add(tso500Flag); -// } -// String eglhHaemOncFlag = indexer.getEGLHHaemOnc(transcriptId.split("\\.")[0]); -// if (StringUtils.isNotEmpty(eglhHaemOncFlag)) { -// transcript.getFlags().add(eglhHaemOncFlag); -// } gene.getTranscripts().add(transcript); @@ -654,7 +659,7 @@ private Transcript getTranscript(Gtf gtf, String chromosome, String transcriptId } private String getGeneId(Gtf gtf) throws CellBaseException { - // db_xref "GeneID:100287102"; + // Splitting the db_xref, e.g.: "GeneID:100287102" String xrefString = gtf.getAttributes().get("db_xref"); String[] xrefs = xrefString.split(","); for (String xref : xrefs) { @@ -667,11 +672,11 @@ private String getGeneId(Gtf gtf) throws CellBaseException { throw new CellBaseException("Didn't find geneId for db_xref:" + xrefString); } - private String getSequenceName(String fullSequenceName) { + private String getSequenceName(String fullSequenceName) throws CellBaseException { String[] sequenceNameParts = fullSequenceName.split("\\."); if (sequenceNameParts.length != 2) { - throw new RuntimeException("bad chromosome: " + fullSequenceName); + throw new CellBaseException("Invalid sequence name: bad chromosome: " + fullSequenceName); } // just get the first part, e.g. 
NC_000024.11 @@ -683,20 +688,6 @@ private String getSequenceName(String fullSequenceName) { return fullSequenceName; } -// private void setAnnotationFiles(Path refSeqDirectoryPath) { -// Path geneDirectoryPath = refSeqDirectoryPath.getParent().resolve("gene"); -// maneFile = geneDirectoryPath.resolve("MANE.GRCh38.v1.0.summary.txt.gz"); -// lrgFile = geneDirectoryPath.resolve("list_LRGs_transcripts_xrefs.txt"); -// geneDrugFile = geneDirectoryPath.resolve("dgidb.tsv"); -// disgenetFile = geneDirectoryPath.resolve("all_gene_disease_associations.tsv.gz"); -// hpoFile = geneDirectoryPath.resolve("phenotype_to_genes.txt"); -// cancerGeneCensus = geneDirectoryPath.resolve("cancer-gene-census.tsv"); -// cancerHotspot = geneDirectoryPath.resolve("hotspots_v2.xls"); -// tso500File = geneDirectoryPath.resolve("TSO500_transcripts.txt"); -// eglhHaemOncFile = geneDirectoryPath.resolve("EGLH_HaemOnc_transcripts.txt"); -// miRTarBaseFile = refSeqDirectoryPath.getParent().resolve("regulation/hsa_MTI.xlsx"); -// } - static { REFSEQ_CHROMOSOMES.put("NC_000001", "1"); REFSEQ_CHROMOSOMES.put("NC_000002", "2"); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java index 596c8b61c9..6a4fe69fc9 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java @@ -32,18 +32,16 @@ public RefSeqGeneBuilderIndexer(Path refSeqDirectoryPath) { } public void index(Path maneFile, Path lrgFile, Path proteinFastaFile, Path cDnaFastaFile, Path geneDrugFile, Path hpoFilePath, - Path disgenetFile, Path miRTarBaseFile, Path cancerGeneGensus, Path cancerHotspot) + Path miRTarBaseFile, Path cancerGeneGensus, Path cancerHotspot) throws IOException, RocksDBException, FileFormatException, CellBaseException { indexManeMapping(maneFile, 
REFSEQ_DATA); indexLrgMapping(lrgFile, REFSEQ_DATA); indexProteinSequences(proteinFastaFile); indexCdnaSequences(cDnaFastaFile); indexDrugs(geneDrugFile); - indexDiseases(hpoFilePath, disgenetFile); + indexDiseases(hpoFilePath); indexMiRTarBase(miRTarBaseFile); indexCancerGeneCensus(cancerGeneGensus); indexCancerHotspot(cancerHotspot); -// indexTSO500(tso500File); -// indexEGLHHaemOnc(eglhHaemOncFile); } } From 1d47fd9b9e5b7c815e391fac4edc1fad1b6e9761 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 31 Jul 2024 10:21:22 +0200 Subject: [PATCH 116/148] lib: fix sonnar issues, #TASK-5576, #TASK-5564 --- .../cellbase/lib/builders/GeneBuilder.java | 3 +- .../lib/builders/GeneBuilderIndexer.java | 154 ++++++------------ 2 files changed, 47 insertions(+), 110 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java index 44b7e587fc..b850f9b40a 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java @@ -18,7 +18,6 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.config.SpeciesConfiguration; -import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; import java.nio.file.Files; @@ -38,7 +37,7 @@ public class GeneBuilder extends AbstractBuilder { private RefSeqGeneBuilder refSeqGeneBuilder; public GeneBuilder(Path downloadPath, Path buildPath, SpeciesConfiguration speciesConfiguration, boolean flexibleGTFParsing, - CellBaseConfiguration configuration) throws CellBaseException { + CellBaseConfiguration configuration) { super(null); this.downloadPath = downloadPath; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java index 7b980ffd72..8db1ab315f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java @@ -59,16 +59,16 @@ public class GeneBuilderIndexer { protected String dbLocation; protected Options dbOption; - protected final String HGNC_ID_SUFFIX = "_hgncid"; - protected final String MANE_SUFFIX = "_mane"; - protected final String LRG_SUFFIX = "_lrg"; - protected final String CANCER_GENE_CENSUS_SUFFIX = "_cgc"; - protected final String CANCER_HOTSPOT_SUFFIX = "_chs"; - protected final String PROTEIN_SEQUENCE_SUFFIX = "_protein_fasta"; - protected final String CDNA_SEQUENCE_SUFFIX = "_cdna_fasta"; - protected final String DRUGS_SUFFIX = "_drug"; - protected final String DISEASE_SUFFIX = "_disease"; - protected final String MIRTARBASE_SUFFIX = "_mirtarbase"; + protected static final String HGNC_ID_SUFFIX = "_hgncid"; + protected static final String MANE_SUFFIX = "_mane"; + protected static final String LRG_SUFFIX = "_lrg"; + protected static final String CANCER_GENE_CENSUS_SUFFIX = "_cgc"; + protected static final String CANCER_HOTSPOT_SUFFIX = "_chs"; + protected static final String PROTEIN_SEQUENCE_SUFFIX = "_protein_fasta"; + protected static final String CDNA_SEQUENCE_SUFFIX = "_cdna_fasta"; + protected static final String DRUGS_SUFFIX = "_drug"; + protected static final String DISEASE_SUFFIX = "_disease"; + protected static final String MIRTARBASE_SUFFIX = "_mirtarbase"; public GeneBuilderIndexer(Path genePath) { this.init(genePath); @@ -85,12 +85,12 @@ private void init(Path genePath) { protected void indexCdnaSequences(Path cDnaFastaFile) throws IOException, FileFormatException, RocksDBException { logger.info(PARSING_LOG_MESSAGE, cDnaFastaFile); - FastaReader fastaReader = new FastaReader(cDnaFastaFile); - Fasta fasta; - while ((fasta = fastaReader.read()) 
!= null) { - rocksDbManager.update(rocksdb, fasta.getId() + CDNA_SEQUENCE_SUFFIX, fasta.getSeq()); + try (FastaReader fastaReader = new FastaReader(cDnaFastaFile)) { + Fasta fasta; + while ((fasta = fastaReader.read()) != null) { + rocksDbManager.update(rocksdb, fasta.getId() + CDNA_SEQUENCE_SUFFIX, fasta.getSeq()); + } } - fastaReader.close(); logger.info(PARSING_DONE_LOG_MESSAGE, cDnaFastaFile); } @@ -104,12 +104,12 @@ protected void indexProteinSequences(Path proteinFastaFile) throws IOException, } logger.info(PARSING_LOG_MESSAGE, proteinFastaFile); - FastaReader fastaReader = new FastaReader(proteinFastaFile); - Fasta fasta; - while ((fasta = fastaReader.read()) != null) { - rocksDbManager.update(rocksdb, fasta.getId() + PROTEIN_SEQUENCE_SUFFIX, fasta.getSeq()); + try (FastaReader fastaReader = new FastaReader(proteinFastaFile)) { + Fasta fasta; + while ((fasta = fastaReader.read()) != null) { + rocksDbManager.update(rocksdb, fasta.getId() + PROTEIN_SEQUENCE_SUFFIX, fasta.getSeq()); + } } - fastaReader.close(); logger.info(PARSING_DONE_LOG_MESSAGE, proteinFastaFile); } @@ -236,17 +236,17 @@ protected void indexCancerGeneCensus(Path cgcFile) throws IOException, RocksDBEx try (BufferedReader bufferedReader = FileUtils.newBufferedReader(cgcFile)) { // Skip the first header line - bufferedReader.readLine(); + String line = bufferedReader.readLine(); GeneCancerAssociation cancerGeneAssociation; - String line; + while ((line = bufferedReader.readLine()) != null) { String[] fields = line.split("\t", -1); // Find Ensembl Gene Id in the last comma-separated column List synonyms = StringUtils.isNotEmpty(fields[19]) ? 
Arrays.stream(fields[19] - .replaceAll("\"", "") - .replaceAll(" ", "") + .replace("\"", "") + .replace(" ", "") .split(",")) .collect(Collectors.toList()) : Collections.emptyList(); @@ -262,53 +262,54 @@ protected void indexCancerGeneCensus(Path cgcFile) throws IOException, RocksDBEx boolean somatic = StringUtils.isNotEmpty(fields[7]) && fields[7].equalsIgnoreCase("yes"); boolean germline = StringUtils.isNotEmpty(fields[8]) && fields[8].equalsIgnoreCase("yes"); List somaticTumourTypes = StringUtils.isNotEmpty(fields[9]) - ? Arrays.asList(fields[9].replaceAll("\"", "").split(", ")) + ? Arrays.asList(fields[9].replace("\"", "").split(", ")) : new ArrayList<>(); List germlineTumourTypes = StringUtils.isNotEmpty(fields[10]) - ? Arrays.asList(fields[10].replaceAll("\"", "").split(", ")) + ? Arrays.asList(fields[10].replace("\"", "").split(", ")) : Collections.emptyList(); List syndromes = StringUtils.isNotEmpty(fields[11]) - ? Arrays.asList(fields[11].replaceAll("\"", "").split("; ")) + ? Arrays.asList(fields[11].replace("\"", "").split("; ")) : Collections.emptyList(); List tissues = StringUtils.isNotEmpty(fields[12]) ? Arrays.stream(fields[12] - .replaceAll("\"", "") - .replaceAll(" ", "") + .replace("\"", "") + .replace(" ", "") .split(",")) .map(tissuesMap::get) .collect(Collectors.toList()) : Collections.emptyList(); - List modeOfInheritance = StringUtils.isNotEmpty(fields[13]) - ? fields[13].equalsIgnoreCase("Dom/Rec") - ? Arrays.asList(moiMap.get("Dom"), moiMap.get("Rec")) - : Collections.singletonList(moiMap.get(fields[13])) - : Collections.emptyList(); + List modeOfInheritance = Collections.emptyList(); + if (StringUtils.isNotEmpty(fields[13])) { + modeOfInheritance = fields[13].equalsIgnoreCase("Dom/Rec") + ? Arrays.asList(moiMap.get("Dom"), moiMap.get("Rec")) + : Collections.singletonList(moiMap.get(fields[13])); + } List roleInCancer = StringUtils.isNotEmpty(fields[14]) ? 
Arrays.stream(fields[14] - .replaceAll("\"", "") - .replaceAll(" ", "") + .replace("\"", "") + .replace(" ", "") .split(",")) .map(roleInCancerMap::get) .collect(Collectors.toList()) : Collections.emptyList(); List mutationTypes = StringUtils.isNotEmpty(fields[15]) ? Arrays.stream(fields[15] - .replaceAll("\"", "") - .replaceAll(" ", "") + .replace("\"", "") + .replace(" ", "") .split(",")) .map(mutationTypesMap::get) .collect(Collectors.toList()) : Collections.emptyList(); List translocationPartners = StringUtils.isNotEmpty(fields[16]) ? Arrays.stream(fields[16] - .replaceAll("\"", "") - .replaceAll(" ", "") + .replace("\"", "") + .replace(" ", "") .split(",")) .collect(Collectors.toList()) : Collections.emptyList(); List otherSyndromes = StringUtils.isNotEmpty(fields[18]) ? Arrays.stream(fields[18] - .replaceAll("\"", "") + .replace("\"", "") .split("; ")) .collect(Collectors.toList()) : Collections.emptyList(); @@ -426,8 +427,8 @@ public void indexCancerHotspot(Path cancerHotspot) throws IOException, RocksDBEx } } - for (String geneName : visited.keySet()) { - rocksDbManager.update(rocksdb, geneName + CANCER_HOTSPOT_SUFFIX, visited.get(geneName)); + for (Map.Entry> entry : visited.entrySet()) { + rocksDbManager.update(rocksdb, entry.getKey() + CANCER_HOTSPOT_SUFFIX, entry.getValue()); } logger.info(PARSING_DONE_LOG_MESSAGE, cancerHotspot); @@ -438,66 +439,6 @@ public List getCancerHotspot(String geneName) throws RocksDBExcep return rocksDbManager.getCancerHotspot(rocksdb, key); } -// protected void indexTSO500(Path tso500Path) throws IOException, RocksDBException { -// logger.info(PARSING_LOG_MESSAGE, tso500Path); -// -// try (BufferedReader bufferedReader = FileUtils.newBufferedReader(tso500Path)) { -// String line = bufferedReader.readLine(); -// // Gene Ref Seq -// // FAS NM_000043 -// // AR NM_000044 -// while (StringUtils.isNotEmpty(line)) { -// if (!line.startsWith("#")) { -// String[] fields = line.split("\t", -1); -// if (fields.length == 2) { -// 
rocksDbManager.update(rocksdb, fields[1] + TSO500_SUFFIX, "TSO500"); -// } -// } -// line = bufferedReader.readLine(); -// } -// } -// logger.info(PARSING_DONE_LOG_MESSAGE, tso500Path); -// } -// -// public String getTSO500(String transcriptId) throws RocksDBException { -// String key = transcriptId + TSO500_SUFFIX; -// byte[] bytes = rocksdb.get(key.getBytes()); -// if (bytes == null) { -// return null; -// } -// return new String(bytes); -// } - -// protected void indexEGLHHaemOnc(Path eglhHaemOncPath) throws IOException, RocksDBException { -// logger.info(PARSING_LOG_MESSAGE, eglhHaemOncPath); -// -// try (BufferedReader bufferedReader = FileUtils.newBufferedReader(eglhHaemOncPath)) { -// String line = bufferedReader.readLine(); -// // Gene Ref Seq -// // GNB1 NM_002074.4 -// // CSF3R NM_000760.3 -// while (StringUtils.isNotEmpty(line)) { -// if (!line.startsWith("#")) { -// String[] fields = line.split("\t", -1); -// if (fields.length == 2) { -// rocksDbManager.update(rocksdb, fields[1].split("\\.")[0] + EGLH_HAEMONC_SUFFIX, "EGLH_HaemOnc"); -// } -// } -// line = bufferedReader.readLine(); -// } -// } -// logger.info(PARSING_DONE_LOG_MESSAGE, eglhHaemOncPath); -// } -// -// public String getEGLHHaemOnc(String transcriptId) throws RocksDBException { -// String key = transcriptId + EGLH_HAEMONC_SUFFIX; -// byte[] bytes = rocksdb.get(key.getBytes()); -// if (bytes == null) { -// return null; -// } -// return new String(bytes); -// } - private String getIndexEntry(String id, String suffix) throws RocksDBException { return getIndexEntry(id, suffix, ""); } @@ -530,10 +471,8 @@ protected void indexDrugs(Path geneDrugFile) throws IOException, RocksDBExceptio try (BufferedReader br = FileUtils.newBufferedReader(geneDrugFile)) { // Skip header - br.readLine(); + String line = br.readLine(); - int lineCounter = 1; - String line; while ((line = br.readLine()) != null) { String[] parts = line.split("\t"); String geneName = parts[0]; @@ -578,7 +517,6 @@ protected void 
indexDrugs(Path geneDrugFile) throws IOException, RocksDBExceptio GeneDrugInteraction drug = new GeneDrugInteraction( geneName, drugName, source, null, null, interactionType, chemblId, publications); drugs.add(drug); - lineCounter++; } } // update last gene @@ -600,7 +538,7 @@ protected void indexDiseases(Path hpoFilePath) throws IOException, RocksDBExcept logger.info(PARSING_LOG_MESSAGE, hpoFilePath); try (BufferedReader bufferedReader = FileUtils.newBufferedReader(hpoFilePath)) { // Skip first header line - bufferedReader.readLine(); + line = bufferedReader.readLine(); while ((line = bufferedReader.readLine()) != null) { String[] fields = line.split("\t"); String omimId = fields[6]; From 7fbc0547aaf99d13ba57416e7ae5acf6fa3b6832 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 31 Jul 2024 11:03:51 +0200 Subject: [PATCH 117/148] lib: add variant and variant_structural_variations in the configuration file for species (e.g., mmusculus), and update the variant downloader according to these changes, #TASK-6426, #TASK-5564 --- .../src/main/resources/configuration.yml | 2 ++ .../org/opencb/cellbase/lib/EtlCommons.java | 2 ++ .../download/VariationDownloadManager.java | 31 ++++++++----------- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 29bf940175..b84fe7d399 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -53,6 +53,8 @@ download: XREFS: "script:gene_extra_info.pl@xrefs.txt" CANONICAL: "script:ensembl_canonical.pl@ensembl_canonical.txt" GENOME_INFO: "script:genome_info.pl@genome_info.json" + MMUSCULUS_VARIATION: "release-put_release_here/variation/vcf/put_species_here/put_species_here.vcf.gz" + MMUSCULUS_STRUCTURAL_VARIATIONS: 
"release-put_release_here/variation/vcf/put_species_here/put_species_here_structural_variations.vcf.gz" ensemblGenomes: database: diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index e94acaf4bf..7e7941bcf7 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -86,6 +86,8 @@ public final class EtlCommons { public static final String ENSEMBL_XREFS_FILE_ID = "XREFS"; public static final String ENSEMBL_CANONICAL_FILE_ID = "CANONICAL"; public static final String GENOME_INFO_FILE_ID = "GENOME_INFO"; + public static final String VARIATION_FILE_ID = "VARIATION"; + public static final String STRUCTURAL_VARIATIONS_FILE_ID = "STRUCTURAL_VARIATIONS"; // Genome public static final String GENOME_DATA = "genome"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java index 376369eedd..ce396db682 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java @@ -24,6 +24,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -56,26 +57,20 @@ public List downloadVariation() throws IOException, InterruptedExc logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(VARIATION_DATA)); - List urls = new ArrayList<>(); - - String fileName = variationFolder.resolve(speciesShortName + ".gtf.gz").toString(); - String url = ensemblHostUrl + "/" + ensemblRelease + "/variation/vcf/" + speciesShortName + "/" - + speciesShortName + ".vcf.gz"; - logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, 
fileName); - downloadFiles.add(downloadFile(url, fileName)); - urls.add(url); - logger.info(OK_LOG_MESSAGE); + DownloadFile downloadFile; + String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); - fileName = variationFolder.resolve(speciesShortName + "_structural_variations.gtf.gz").toString(); - url = ensemblHostUrl + "/" + ensemblRelease + "/variation/vcf/" + speciesShortName + "/" - + speciesShortName + "_structural_variations.vcf.gz"; - logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, fileName); - downloadFiles.add(downloadFile(url, fileName)); - urls.add(url); - logger.info(OK_LOG_MESSAGE); + // Variation and structural variations + List fileIds = Arrays.asList(prefixId + VARIATION_FILE_ID, prefixId + STRUCTURAL_VARIATIONS_FILE_ID); + List urls = new ArrayList<>(); + for (String fileId : fileIds) { + downloadFile = downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), fileId, null, variationFolder); + downloadFiles.add(downloadFile); + urls.add(downloadFile.getUrl()); + } - saveDataSource(VARIATION_DATA, ensemblVersion, getTimeStamp(), urls, variationFolder.resolve( - getDataVersionFilename(VARIATION_DATA))); + saveDataSource(VARIATION_DATA, ensemblVersion, getTimeStamp(), urls, + variationFolder.resolve(getDataVersionFilename(VARIATION_DATA))); logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(VARIATION_DATA)); } From d483dcf6f757f853e88532dc4d1a694bfc359935 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 1 Aug 2024 10:22:55 +0200 Subject: [PATCH 118/148] app: improve CellBase loader by creating a new function to be reused by the different data, e.g., repeats, #TASK-6142, #TASK-5564 --- .../admin/executors/LoadCommandExecutor.java | 77 +++++++++++-------- .../org/opencb/cellbase/lib/EtlCommons.java | 6 +- .../cellbase/lib/builders/RepeatsBuilder.java | 3 +- 3 files changed, 47 insertions(+), 39 deletions(-) diff --git 
a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index 7861f25dbe..b6af2caa3b 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -45,6 +45,7 @@ import java.util.concurrent.ExecutionException; import static org.opencb.cellbase.lib.EtlCommons.*; +import static org.opencb.cellbase.lib.builders.RepeatsBuilder.REPEATS_OUTPUT_FILENAME; /** * Created by imedina on 03/02/15. @@ -473,34 +474,9 @@ private void loadClinical() throws FileNotFoundException { } } - private void loadRepeats() { - Path path = input.resolve(EtlCommons.REPEATS_JSON + ".json.gz"); - if (Files.exists(path)) { - try { - // Load data - logger.debug("Loading '{}' ...", path); - loadRunner.load(path, "repeats", dataRelease); - - // Create index - createIndex("repeats"); - - // Update release (collection and sources) - List sources = new ArrayList<>(Arrays.asList( - input.resolve(getDataVersionFilename(TRF_DATA)), - input.resolve(getDataVersionFilename(GSD_DATA)), - input.resolve(getDataVersionFilename(WM_DATA)) - )); - dataReleaseManager.update(dataRelease, "repeats", EtlCommons.REPEATS_DATA, sources); - } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException - | IllegalAccessException | ExecutionException | IOException | InterruptedException | CellBaseException e) { - logger.error(e.toString()); - } catch (LoaderException e) { - e.printStackTrace(); - } - } else { - logger.warn("Repeats file {} not found", path); - logger.warn("No repeats data will be loaded"); - } + private void loadRepeats() throws CellBaseException { + Path jsonPath = input.resolve(REPEATS_DATA).resolve(REPEATS_OUTPUT_FILENAME); + loadJson(REPEATS_DATA, jsonPath); } private void 
loadSpliceScores() throws NoSuchMethodException, InterruptedException, ExecutionException, InstantiationException, @@ -591,16 +567,51 @@ private void loadPharmacogenomica() throws IOException, CellBaseException { dataReleaseManager.update(dataRelease, EtlCommons.PHARMACOGENOMICS_DATA, EtlCommons.PHARMACOGENOMICS_DATA, sources); } - private void createIndex(String collection) { + private void loadJson(String data, Path jsonPath) throws CellBaseException { + String dataName = getDataName(data); + if (!Files.exists(jsonPath)) { + logger.warn("JSON file {} not found", jsonPath); + logger.warn("No '{}' data will be loaded", dataName); + return; + } + + try { + // Load data + logger.debug("Loading JSON file '{}' ...", jsonPath); + loadRunner.load(jsonPath, "repeats", dataRelease); + + // Create index + createIndex(data); + + // Update release (collection and sources) + List sources = new ArrayList<>(); + for (File file : jsonPath.getParent().toFile().listFiles()) { + if (file.getName().endsWith(SUFFIX_VERSION_FILENAME)) { + sources.add(file.getAbsoluteFile().toPath()); + } + } + dataReleaseManager.update(dataRelease, data, data, sources); + } catch (Exception e) { + throw new CellBaseException("Error loading data '" + dataName + "'", e); + } + } + + private void createIndex(String data) { if (!createIndexes) { return; } - String collectionName = CellBaseDBAdaptor.buildCollectionName(collection, dataRelease); - logger.info("Loading indexes for '{}' collection ...", collectionName); + + String dataName = null; + String collectionName = null; try { + dataName = getDataName(data); + collectionName = CellBaseDBAdaptor.buildCollectionName(data, dataRelease); + logger.info("Creating indexes for data '{}' in collection '{}' ...", dataName, collectionName); indexManager.createMongoDBIndexes(Collections.singletonList(collectionName), true); - } catch (IOException e) { - logger.error("Error creating index: {}", e.getMessage()); + logger.info(DONE_LOG_MESSAGE); + } catch 
(IOException | CellBaseException e) { + logger.error("Error creating indexes for data '{}' in collection '{}': {}", dataName, collectionName, + Arrays.toString(e.getStackTrace())); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 7e7941bcf7..ea0bd02b94 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -66,6 +66,7 @@ public final class EtlCommons { public static final String GZ_EXTENSION = ".gz"; public static final String OK_LOG_MESSAGE = "Ok."; + public static final String DONE_LOG_MESSAGE = "Done."; // Ensembl public static final String ENSEMBL_DATA = "ensembl"; @@ -211,11 +212,6 @@ public final class EtlCommons { // Repeats public static final String REPEATS_DATA = "repeats"; - /** - * @deprecated (when refactoring downloaders, builders and loaders) - */ - @Deprecated - public static final String REPEATS_JSON = "repeats"; // Simple repeats public static final String TRF_DATA = "trf"; // Must match the configuration file diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java index ce55659f65..5add326db7 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java @@ -44,7 +44,8 @@ public class RepeatsBuilder extends AbstractBuilder { private List dataList; private final Path filesDir; - public static final String REPEATS_OUTPUT_FILENAME = EtlCommons.REPEATS_DATA + ".json.gz"; + public static final String REPEATS_OUTPUT_BASENAME = "repeats"; + public static final String REPEATS_OUTPUT_FILENAME = REPEATS_OUTPUT_BASENAME + ".json.gz"; public RepeatsBuilder(List dataList, Path filesDir, CellBaseFileSerializer serializer, 
CellBaseConfiguration configuration) { super(serializer); From 7f62ce72b8af94b6e0d7eba59e70d143d4a96e84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 1 Aug 2024 11:58:33 +0200 Subject: [PATCH 119/148] lib: improve genome sequence and info loader, #TASK-6142, #TASK-5564 - Re-using the fucntion loadJsonFile - Adding a mongodb index for the collection genome info - Adding log messages --- .../admin/executors/BuildCommandExecutor.java | 6 +- .../admin/executors/LoadCommandExecutor.java | 89 ++++++++++++------- .../org/opencb/cellbase/lib/EtlCommons.java | 1 + .../builders/GenomeSequenceFastaBuilder.java | 6 +- .../cellbase/lib/loader/LoadRunner.java | 2 +- .../src/main/resources/mongodb-indexes.json | 2 + 6 files changed, 65 insertions(+), 41 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index aff9c5cc89..2b9f53fbf8 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -48,7 +48,7 @@ import static org.opencb.cellbase.lib.builders.AbstractBuilder.BUILDING_DONE_LOG_MESSAGE; import static org.opencb.cellbase.lib.builders.AbstractBuilder.BUILDING_LOG_MESSAGE; import static org.opencb.cellbase.lib.builders.EnsemblGeneBuilder.ENSEMBL_GENE_OUTPUT_FILENAME; -import static org.opencb.cellbase.lib.builders.GenomeSequenceFastaBuilder.GENOME_OUTPUT_FILENAME; +import static org.opencb.cellbase.lib.builders.GenomeSequenceFastaBuilder.GENOME_JSON_FILENAME; import static org.opencb.cellbase.lib.builders.ProteinBuilder.OUTPUT_PROTEIN_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.builders.RefSeqGeneBuilder.REFSEQ_GENE_OUTPUT_FILENAME; import static 
org.opencb.cellbase.lib.builders.RegulatoryFeatureBuilder.*; @@ -195,7 +195,7 @@ private AbstractBuilder buildGenomeSequence() throws CellBaseException { Path genomeDownloadFolder = downloadFolder.resolve(GENOME_DATA); Path genomeBuildFolder = buildFolder.resolve(GENOME_DATA); - if (Files.exists(genomeBuildFolder.resolve(GENOME_OUTPUT_FILENAME)) + if (Files.exists(genomeBuildFolder.resolve(GENOME_JSON_FILENAME)) && Files.exists(genomeBuildFolder.resolve(GENOME_INFO_FILENAME)) && Files.exists(genomeBuildFolder.resolve(getDataVersionFilename(GENOME_DATA)))) { logger.warn(DATA_ALREADY_BUILT, getDataName(GENOME_DATA)); @@ -222,7 +222,7 @@ private AbstractBuilder buildGenomeSequence() throws CellBaseException { } // Parse file - if (!Files.exists(genomeBuildFolder.resolve(GENOME_OUTPUT_FILENAME))) { + if (!Files.exists(genomeBuildFolder.resolve(GENOME_JSON_FILENAME))) { // Get FASTA path Path fastaPath = getFastaReferenceGenome(); diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index b6af2caa3b..8e008b6c97 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -45,7 +45,9 @@ import java.util.concurrent.ExecutionException; import static org.opencb.cellbase.lib.EtlCommons.*; +import static org.opencb.cellbase.lib.builders.GenomeSequenceFastaBuilder.GENOME_JSON_FILENAME; import static org.opencb.cellbase.lib.builders.RepeatsBuilder.REPEATS_OUTPUT_FILENAME; +import static org.opencb.cellbase.lib.download.GenomeDownloadManager.GENOME_INFO_FILENAME; /** * Created by imedina on 03/02/15. 
@@ -128,24 +130,7 @@ public void execute() throws CellBaseException { try { switch (loadOption) { case EtlCommons.GENOME_DATA: { - // Load data - if (input.resolve("genome_info.json").toFile().exists()) { - loadIfExists(input.resolve("genome_info.json"), "genome_info"); - } else { - loadIfExists(input.resolve("genome_info.json.gz"), "genome_info"); - } - loadIfExists(input.resolve("genome_sequence.json.gz"), "genome_sequence"); - - // Create index - createIndex("genome_info"); - createIndex("genome_sequence"); - - // Update release (collection and sources) - List sources = new ArrayList<>(Arrays.asList( - input.resolve("genomeVersion.json") - )); - dataReleaseManager.update(dataRelease, "genome_info", EtlCommons.GENOME_DATA, sources); - dataReleaseManager.update(dataRelease, "genome_sequence", null, null); + loadGenome(); break; } case EtlCommons.GENE_DATA: { @@ -474,6 +459,18 @@ private void loadClinical() throws FileNotFoundException { } } + private void loadGenome() throws CellBaseException { + // Genome sequence + Path jsonPath = input.resolve(GENOME_DATA).resolve(GENOME_JSON_FILENAME); + loadJson(GENOME_DATA, GENOME_SEQUENCE_COLLECTION_NAME, jsonPath); + + // Genome info + jsonPath = input.resolve(GENOME_DATA).resolve(GENOME_INFO_FILENAME); + // The fourh parameter to null is required to avoid read and load the genome version file since it was done previously + // when loading the GENOME_JSON_FILENAME into the collection GENOME_SEQUENCE_COLLECTION_NAME + loadJson(GENOME_INFO_DATA, GENOME_INFO_DATA, jsonPath, null); + } + private void loadRepeats() throws CellBaseException { Path jsonPath = input.resolve(REPEATS_DATA).resolve(REPEATS_OUTPUT_FILENAME); loadJson(REPEATS_DATA, jsonPath); @@ -568,35 +565,61 @@ private void loadPharmacogenomica() throws IOException, CellBaseException { } private void loadJson(String data, Path jsonPath) throws CellBaseException { - String dataName = getDataName(data); - if (!Files.exists(jsonPath)) { - logger.warn("JSON file {} not 
found", jsonPath); - logger.warn("No '{}' data will be loaded", dataName); + loadJson(data, data, jsonPath); + } + + private void loadJson(String data, String collection, Path jsonPath) throws CellBaseException { + if (!existsJsonFile(jsonPath, data)) { + return; + } + List sources = new ArrayList<>(); + for (File file : jsonPath.getParent().toFile().listFiles()) { + if (file.getName().endsWith(SUFFIX_VERSION_FILENAME)) { + sources.add(file.getAbsoluteFile().toPath()); + } + } + loadJson(data, collection, jsonPath, sources); + } + + private void loadJson(String data, String collection, Path jsonPath, List sources) throws CellBaseException { + if (!existsJsonFile(jsonPath, data)) { return; } + String dataName = getDataName(data); + try { // Load data - logger.debug("Loading JSON file '{}' ...", jsonPath); - loadRunner.load(jsonPath, "repeats", dataRelease); + logger.info("Loading JSON file '{}' ...", jsonPath); + loadRunner.load(jsonPath, collection, dataRelease); + logger.info(DONE_LOG_MESSAGE); // Create index - createIndex(data); + createIndex(data, collection); // Update release (collection and sources) - List sources = new ArrayList<>(); - for (File file : jsonPath.getParent().toFile().listFiles()) { - if (file.getName().endsWith(SUFFIX_VERSION_FILENAME)) { - sources.add(file.getAbsoluteFile().toPath()); - } - } - dataReleaseManager.update(dataRelease, data, data, sources); + dataReleaseManager.update(dataRelease, collection, data, sources); } catch (Exception e) { throw new CellBaseException("Error loading data '" + dataName + "'", e); } } + private boolean existsJsonFile(Path jsonPath, String data) throws CellBaseException { + String dataName = getDataName(data); + if (!Files.exists(jsonPath)) { + logger.warn("JSON file {} not found", jsonPath); + logger.warn("No '{}' data will be loaded", dataName); + return false; + } + return true; + } + + @Deprecated private void createIndex(String data) { + createIndex(data, data); + } + + private void createIndex(String 
data, String collection) { if (!createIndexes) { return; } @@ -605,7 +628,7 @@ private void createIndex(String data) { String collectionName = null; try { dataName = getDataName(data); - collectionName = CellBaseDBAdaptor.buildCollectionName(data, dataRelease); + collectionName = CellBaseDBAdaptor.buildCollectionName(collection, dataRelease); logger.info("Creating indexes for data '{}' in collection '{}' ...", dataName, collectionName); indexManager.createMongoDBIndexes(Collections.singletonList(collectionName), true); logger.info(DONE_LOG_MESSAGE); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index ea0bd02b94..16cda025e0 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -92,6 +92,7 @@ public final class EtlCommons { // Genome public static final String GENOME_DATA = "genome"; + public static final String GENOME_SEQUENCE_COLLECTION_NAME = "genome_sequence"; public static final String GENOME_INFO_DATA = "genome_info"; // Gene diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GenomeSequenceFastaBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GenomeSequenceFastaBuilder.java index e9395cceea..521c5f3a71 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GenomeSequenceFastaBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GenomeSequenceFastaBuilder.java @@ -20,21 +20,19 @@ import org.opencb.biodata.models.core.GenomeSequenceChunk; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.opencb.cellbase.lib.EtlCommons; import org.opencb.commons.utils.FileUtils; import java.io.BufferedReader; import java.io.IOException; import java.nio.file.Path; -import static 
org.opencb.cellbase.lib.EtlCommons.*; - public class GenomeSequenceFastaBuilder extends AbstractBuilder { private Path genomeReferenceFastaFile; private static final int CHUNK_SIZE = 2000; - public static final String GENOME_OUTPUT_FILENAME = EtlCommons.GENOME_DATA + ".json.gz"; + public static final String GENOME_JSON_BASENAME = "genome"; + public static final String GENOME_JSON_FILENAME = GENOME_JSON_BASENAME + ".json.gz"; public GenomeSequenceFastaBuilder(Path genomeReferenceFastaFile, CellBaseSerializer serializer) { super(serializer); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/loader/LoadRunner.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/loader/LoadRunner.java index f921403ffa..3904099332 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/loader/LoadRunner.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/loader/LoadRunner.java @@ -195,7 +195,7 @@ private int readInputJsonFile(Path inputFile) { batch = new ArrayList<>(batchSize); } if (inputFileRecords % batchSize == 0) { - logger.info("{} records read from {}", inputFileRecords, inputFile.toString()); + logger.debug("{} records read from {}", inputFileRecords, inputFile); } } br.close(); diff --git a/cellbase-lib/src/main/resources/mongodb-indexes.json b/cellbase-lib/src/main/resources/mongodb-indexes.json index 965effb194..9a3a023300 100644 --- a/cellbase-lib/src/main/resources/mongodb-indexes.json +++ b/cellbase-lib/src/main/resources/mongodb-indexes.json @@ -16,6 +16,8 @@ {"collection": "conservation", "fields": {"_chunkIds": 1}, "options": {"background": true}} {"collection": "conservation", "fields": {"chromosome": 1, "start": 1, "end": 1}, "options": {"background": true}} +{"collection": "genome_info", "fields": {"supercontigs.name": 1}, "options": {"background": true}} + {"collection": "genome_sequence", "fields": {"_chunkIds": 1}, "options": {"background": true}} {"collection": "genome_sequence", "fields": {"chromosome": 1, "start": 1, "end": 
1}, "options": {"background": true}} {"collection": "genome_sequence", "fields": {"sequenceType": 1}, "options": {"background": true}} From 0602bba6c6759d59e9a4fd845e40efea7f246b84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 1 Aug 2024 16:11:46 +0200 Subject: [PATCH 120/148] app: update CellBase loader for conservation data, #TASK-6142, #TASK-5564 --- .../admin/executors/LoadCommandExecutor.java | 191 ++++++++---------- .../lib/managers/DataReleaseManager.java | 86 +++++--- .../lib/GenericMongoDBAdaptorTest.java | 8 +- 3 files changed, 150 insertions(+), 135 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index 8e008b6c97..ccda7b0291 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -38,14 +38,12 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; +import java.util.*; import java.util.concurrent.ExecutionException; import static org.opencb.cellbase.lib.EtlCommons.*; import static org.opencb.cellbase.lib.builders.GenomeSequenceFastaBuilder.GENOME_JSON_FILENAME; +import static org.opencb.cellbase.lib.builders.RegulatoryFeatureBuilder.*; import static org.opencb.cellbase.lib.builders.RepeatsBuilder.REPEATS_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.download.GenomeDownloadManager.GENOME_INFO_FILENAME; @@ -150,7 +148,7 @@ public void execute() throws CellBaseException { input.resolve("disgenetVersion.json"), input.resolve("gnomadVersion.json") )); - dataReleaseManager.update(dataRelease, "gene", EtlCommons.GENE_DATA, sources); + 
dataReleaseManager.update(dataRelease, "gene", sources); break; } case EtlCommons.REFSEQ_DATA: { @@ -163,7 +161,7 @@ public void execute() throws CellBaseException { // Update release (collection and sources) List sources = new ArrayList<>( Collections.singletonList(input.resolve("refseqVersion.json"))); - dataReleaseManager.update(dataRelease, "refseq", EtlCommons.REFSEQ_DATA, sources); + dataReleaseManager.update(dataRelease, "refseq", sources); break; } case EtlCommons.VARIATION_DATA: { @@ -180,8 +178,7 @@ public void execute() throws CellBaseException { // Update release (collection and sources) List sources = new ArrayList<>(Collections.singletonList(input.resolve("caddVersion.json"))); - dataReleaseManager.update(dataRelease, "variation_functional_score", - EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, sources); + dataReleaseManager.update(dataRelease, "variation_functional_score", sources); break; } case EtlCommons.MISSENSE_VARIATION_SCORE_DATA: { @@ -194,28 +191,15 @@ public void execute() throws CellBaseException { // Update release (collection and sources) List sources = new ArrayList<>(Collections.singletonList(input.resolve("revelVersion.json"))); - dataReleaseManager.update(dataRelease, "missense_variation_functional_score", - EtlCommons.MISSENSE_VARIATION_SCORE_DATA, sources); + dataReleaseManager.update(dataRelease, "missense_variation_functional_score", sources); break; } case EtlCommons.CONSERVATION_DATA: { - // Load data, create index and update release loadConservation(); break; } case EtlCommons.REGULATION_DATA: { - // Load data (regulatory region and regulatory PFM)) - loadIfExists(input.resolve("regulatory_region.json.gz"), "regulatory_region"); - loadIfExists(input.resolve("regulatory_pfm.json.gz"), "regulatory_pfm"); - - // Create index - createIndex("regulatory_region"); - createIndex("regulatory_pfm"); - - // Update release (collection and sources) - List sources = new 
ArrayList<>(Collections.singletonList(input.resolve("ensemblRegulationVersion.json"))); - dataReleaseManager.update(dataRelease, "regulatory_region", EtlCommons.REGULATION_DATA, sources); - dataReleaseManager.update(dataRelease, "regulatory_pfm", null, null); + loadRegulation(); break; } case EtlCommons.PROTEIN_DATA: { @@ -230,7 +214,7 @@ public void execute() throws CellBaseException { input.resolve("uniprotVersion.json"), input.resolve("interproVersion.json") )); - dataReleaseManager.update(dataRelease, "protein", EtlCommons.PROTEIN_DATA, sources); + dataReleaseManager.update(dataRelease, "protein", sources); break; } // case EtlCommons.PPI_DATA: @@ -249,7 +233,6 @@ public void execute() throws CellBaseException { break; } case EtlCommons.REPEATS_DATA: { - // Load data, create index and update release loadRepeats(); break; } @@ -269,7 +252,7 @@ public void execute() throws CellBaseException { input.resolve(EtlCommons.GO_VERSION_FILE), input.resolve(EtlCommons.DO_VERSION_FILE) )); - dataReleaseManager.update(dataRelease, "ontology", EtlCommons.ONTOLOGY_DATA, sources); + dataReleaseManager.update(dataRelease, "ontology", sources); break; } case EtlCommons.SPLICE_SCORE_DATA: { @@ -378,7 +361,7 @@ private void loadVariationData() throws NoSuchMethodException, InterruptedExcept List sources = new ArrayList<>(Arrays.asList( input.resolve("ensemblVariationVersion.json") )); - dataReleaseManager.update(dataRelease, "variation", EtlCommons.VARIATION_DATA, sources); + dataReleaseManager.update(dataRelease, "variation", sources); // Custom update required e.g. 
population freqs loading } else { @@ -390,25 +373,7 @@ private void loadVariationData() throws NoSuchMethodException, InterruptedExcept private void loadConservation() throws NoSuchMethodException, InterruptedException, ExecutionException, InstantiationException, IllegalAccessException, InvocationTargetException, ClassNotFoundException, IOException, CellBaseException, LoaderException { - // Load data - DirectoryStream stream = Files.newDirectoryStream(input, - entry -> entry.getFileName().toString().startsWith("conservation_")); - - for (Path entry : stream) { - logger.info("Loading file '{}'", entry); - loadRunner.load(input.resolve(entry.getFileName()), "conservation", dataRelease); - } - - // Create index - createIndex("conservation"); - - // Update release (collection and sources) - List sources = new ArrayList<>(Arrays.asList( - input.resolve("gerpVersion.json"), - input.resolve("phastConsVersion.json"), - input.resolve("phyloPVersion.json") - )); - dataReleaseManager.update(dataRelease, "conservation", EtlCommons.CONSERVATION_DATA, sources); + loadData(input.resolve(CONSERVATION_DATA), CONSERVATION_DATA, "conservation_"); } private void loadProteinFunctionalPrediction() throws NoSuchMethodException, InterruptedException, ExecutionException, @@ -427,7 +392,7 @@ private void loadProteinFunctionalPrediction() throws NoSuchMethodException, Int createIndex("protein_functional_prediction"); // Update release (collection and sources) - dataReleaseManager.update(dataRelease, "protein_functional_prediction", null, null); + dataReleaseManager.update(dataRelease, "protein_functional_prediction", null); } private void loadClinical() throws FileNotFoundException { @@ -447,7 +412,7 @@ private void loadClinical() throws FileNotFoundException { input.resolve("cosmicVersion.json"), input.resolve("gwasVersion.json") )); - dataReleaseManager.update(dataRelease, "clinical_variants", EtlCommons.CLINICAL_VARIANT_DATA, sources); + dataReleaseManager.update(dataRelease, 
"clinical_variants", sources); } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException | IllegalAccessException | ExecutionException | IOException | InterruptedException | CellBaseException e) { logger.error(e.toString()); @@ -460,20 +425,26 @@ private void loadClinical() throws FileNotFoundException { } private void loadGenome() throws CellBaseException { - // Genome sequence - Path jsonPath = input.resolve(GENOME_DATA).resolve(GENOME_JSON_FILENAME); - loadJson(GENOME_DATA, GENOME_SEQUENCE_COLLECTION_NAME, jsonPath); - - // Genome info - jsonPath = input.resolve(GENOME_DATA).resolve(GENOME_INFO_FILENAME); - // The fourh parameter to null is required to avoid read and load the genome version file since it was done previously - // when loading the GENOME_JSON_FILENAME into the collection GENOME_SEQUENCE_COLLECTION_NAME - loadJson(GENOME_INFO_DATA, GENOME_INFO_DATA, jsonPath, null); + HashMap collectionMap = new HashMap<>(); + collectionMap.put(GENOME_SEQUENCE_COLLECTION_NAME, GENOME_JSON_FILENAME); + collectionMap.put(GENOME_INFO_DATA, GENOME_INFO_FILENAME); + + loadData(input.resolve(GENOME_DATA), collectionMap); } private void loadRepeats() throws CellBaseException { - Path jsonPath = input.resolve(REPEATS_DATA).resolve(REPEATS_OUTPUT_FILENAME); - loadJson(REPEATS_DATA, jsonPath); + HashMap collectionMap = new HashMap<>(); + collectionMap.put(REPEATS_DATA, REPEATS_OUTPUT_FILENAME); + + loadData(input.resolve(REPEATS_DATA), collectionMap); + } + + private void loadRegulation() throws CellBaseException { + HashMap collectionMap = new HashMap<>(); + collectionMap.put(REGULATORY_REGION_BASENAME, REGULATORY_REGION_OUTPUT_FILENAME); + collectionMap.put(REGULATORY_PFM_BASENAME, REGULATORY_PFM_OUTPUT_FILENAME); + + loadData(input.resolve(REGULATION_DATA), collectionMap); } private void loadSpliceScores() throws NoSuchMethodException, InterruptedException, ExecutionException, InstantiationException, @@ -493,7 +464,7 @@ 
private void loadSpliceScores() throws NoSuchMethodException, InterruptedExcepti input.resolve(SPLICE_SCORE_DATA + "/" + getDataVersionFilename(MMSPLICE_DATA)), input.resolve(SPLICE_SCORE_DATA + "/" + getDataVersionFilename(SPLICEAI_DATA)) )); - dataReleaseManager.update(dataRelease, "splice_score", SPLICE_SCORE_DATA, sources); + dataReleaseManager.update(dataRelease, SPLICE_SCORE_DATA, sources); } private void loadSpliceScores(Path spliceFolder) throws IOException, ExecutionException, InterruptedException, @@ -531,7 +502,7 @@ private void loadPubMed() throws CellBaseException { // Update release (collection and sources) List sources = Collections.singletonList(pubmedPath.resolve(EtlCommons.getDataVersionFilename(PUBMED_DATA))); - dataReleaseManager.update(dataRelease, PUBMED_DATA, PUBMED_DATA, sources); + dataReleaseManager.update(dataRelease, PUBMED_DATA, sources); } else { logger.warn("PubMed folder {} not found", pubmedPath); } @@ -561,81 +532,93 @@ private void loadPharmacogenomica() throws IOException, CellBaseException { // Update release (collection and sources) List sources = Collections.singletonList(pharmaPath.resolve(getDataVersionFilename(PHARMGKB_DATA))); - dataReleaseManager.update(dataRelease, EtlCommons.PHARMACOGENOMICS_DATA, EtlCommons.PHARMACOGENOMICS_DATA, sources); + dataReleaseManager.update(dataRelease, EtlCommons.PHARMACOGENOMICS_DATA, sources); } - private void loadJson(String data, Path jsonPath) throws CellBaseException { - loadJson(data, data, jsonPath); + private void loadData(Path buildPath, Map collectionMap) throws CellBaseException { + // Load data from the different files into the input collections + for (Map.Entry entry : collectionMap.entrySet()) { + Path jsonPath = buildPath.resolve(entry.getValue()); + loadJsonFile(entry.getKey(), jsonPath); + } + + // Load sources + loadSources(buildPath); } - private void loadJson(String data, String collection, Path jsonPath) throws CellBaseException { - if (!existsJsonFile(jsonPath, data)) 
{ - return; - } - List sources = new ArrayList<>(); - for (File file : jsonPath.getParent().toFile().listFiles()) { - if (file.getName().endsWith(SUFFIX_VERSION_FILENAME)) { - sources.add(file.getAbsoluteFile().toPath()); + private void loadData(Path buildPath, String collection, String prefix) throws CellBaseException, IOException { + // Load data + DirectoryStream stream = Files.newDirectoryStream(buildPath, entry -> entry.getFileName().toString().startsWith(prefix)); + + try { + for (Path entry : stream) { + logger.info("Loading JSON file '{}' ...", entry); + loadRunner.load(buildPath.resolve(entry.getFileName()), collection, dataRelease); + logger.info(DONE_LOG_MESSAGE); } + } catch (Exception e) { + throw new CellBaseException("Error loading data in collection '" + collection + "'", e); } - loadJson(data, collection, jsonPath, sources); + + // Create index + createIndex(collection); + + // Update the data release collection + dataReleaseManager.update(dataRelease, collection, getVersionPaths(buildPath)); } - private void loadJson(String data, String collection, Path jsonPath, List sources) throws CellBaseException { - if (!existsJsonFile(jsonPath, data)) { + private void loadJsonFile(String collection, Path jsonPath) throws CellBaseException { + if (!Files.exists(jsonPath)) { + logger.warn("JSON file '{}' not found. 
No data will be loaded in collection '{}'.", jsonPath, + CellBaseDBAdaptor.buildCollectionName(collection, dataRelease)); return; } - String dataName = getDataName(data); - try { // Load data logger.info("Loading JSON file '{}' ...", jsonPath); loadRunner.load(jsonPath, collection, dataRelease); logger.info(DONE_LOG_MESSAGE); - - // Create index - createIndex(data, collection); - - // Update release (collection and sources) - dataReleaseManager.update(dataRelease, collection, data, sources); } catch (Exception e) { - throw new CellBaseException("Error loading data '" + dataName + "'", e); + throw new CellBaseException("Error loading data in collection '" + collection + "'", e); } - } - private boolean existsJsonFile(Path jsonPath, String data) throws CellBaseException { - String dataName = getDataName(data); - if (!Files.exists(jsonPath)) { - logger.warn("JSON file {} not found", jsonPath); - logger.warn("No '{}' data will be loaded", dataName); - return false; - } - return true; - } + // Create index + createIndex(collection); - @Deprecated - private void createIndex(String data) { - createIndex(data, data); + // Update collection in data release + dataReleaseManager.update(dataRelease, collection); } - private void createIndex(String data, String collection) { + private void createIndex(String collection) { if (!createIndexes) { return; } - String dataName = null; String collectionName = null; try { - dataName = getDataName(data); collectionName = CellBaseDBAdaptor.buildCollectionName(collection, dataRelease); - logger.info("Creating indexes for data '{}' in collection '{}' ...", dataName, collectionName); + logger.info("Creating indexes for collection '{}' ...", collectionName); indexManager.createMongoDBIndexes(Collections.singletonList(collectionName), true); logger.info(DONE_LOG_MESSAGE); - } catch (IOException | CellBaseException e) { - logger.error("Error creating indexes for data '{}' in collection '{}': {}", dataName, collectionName, - 
Arrays.toString(e.getStackTrace())); + } catch (IOException e) { + logger.error("Error creating indexes for collection '{}': {}", collectionName, Arrays.toString(e.getStackTrace())); + } + } + + private void loadSources(Path path) throws CellBaseException { + // Update data source in data release + dataReleaseManager.updateSources(dataRelease, getVersionPaths(path)); + } + + private List getVersionPaths(Path path) { + List sources = new ArrayList<>(); + for (File file : path.toFile().listFiles()) { + if (file.getName().endsWith(SUFFIX_VERSION_FILENAME)) { + sources.add(file.getAbsoluteFile().toPath()); + } } + return sources; } private DataRelease getDataReleaseForLoading(DataReleaseManager dataReleaseManager) throws CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/DataReleaseManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/DataReleaseManager.java index 331b52e5c3..c73f80ff89 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/DataReleaseManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/DataReleaseManager.java @@ -126,7 +126,11 @@ public DataRelease update(int release, List versions) throws CellBaseExc return releaseDBAdaptor.update(release, versions).first(); } - public DataRelease update(int release, String collection, String data, List dataSourcePaths) + public DataRelease update(int release, String collection) throws CellBaseException { + return update(release, collection, Collections.emptyList()); + } + + public DataRelease update(int release, String collection, List dataSourcePaths) throws CellBaseException { DataRelease currDataRelease = get(release); if (currDataRelease != null) { @@ -134,32 +138,8 @@ public DataRelease update(int release, String collection, String data, List newSources = new ArrayList<>(); - - // First, add new data sources - Set sourceSet = new HashSet<>(); - ObjectMapper jsonObjectMapper = new ObjectMapper(); - 
ObjectReader jsonObjectReader = jsonObjectMapper.readerFor(DataSource.class); - for (Path dataSourcePath : dataSourcePaths) { - if (dataSourcePath.toFile().exists()) { - try { - DataSource dataSource = jsonObjectReader.readValue(dataSourcePath.toFile()); - newSources.add(dataSource); - sourceSet.add(dataSource.getCategory() + "__" + dataSource.getName()); - } catch (IOException e) { - logger.warn("Something wrong happened when reading data release source {}: {}", dataSourcePath, e.getMessage()); - } - } - } - - // Second, add previous data sources if necessary (to avoid duplicated sources) - for (DataSource source : currDataRelease.getSources()) { - String key = source.getCategory() + "__" + source.getName(); - if (!sourceSet.contains(key)) { - newSources.add(source); - } - } + if (CollectionUtils.isNotEmpty(dataSourcePaths)) { + List newSources = getDataSources(dataSourcePaths, currDataRelease.getSources()); if (CollectionUtils.isNotEmpty(newSources)) { currDataRelease.setSources(newSources); @@ -174,6 +154,28 @@ public DataRelease update(int release, String collection, String data, List dataSourcePaths) throws CellBaseException { + DataRelease currDataRelease = get(release); + if (currDataRelease == null) { + throw new CellBaseException("Data release '" + release + "' does not exist" + getSpeciesAssemblyMessage()); + } + + // Check sources + if (CollectionUtils.isNotEmpty(dataSourcePaths)) { + List newSources = getDataSources(dataSourcePaths, currDataRelease.getSources()); + + if (CollectionUtils.isNotEmpty(newSources)) { + currDataRelease.setSources(newSources); + } + } + + // Update data release in the database + update(currDataRelease); + + return currDataRelease; + + } + public void update(DataRelease dataRelase) { if (MapUtils.isNotEmpty(dataRelase.getCollections())) { releaseDBAdaptor.update(dataRelase.getRelease(), "collections", dataRelase.getCollections()); @@ -243,4 +245,34 @@ public DataRelease checkDataRelease(int inRelease) throws CellBaseException { 
private String getSpeciesAssemblyMessage() { return " (species = " + species + ", assembly = " + assembly + ")"; } + + private List getDataSources(List dataSourcePaths, List currDataSources) { + List newDataSources = new ArrayList<>(); + + // First, add new data sources + Set sourceSet = new HashSet<>(); + ObjectMapper jsonObjectMapper = new ObjectMapper(); + ObjectReader jsonObjectReader = jsonObjectMapper.readerFor(DataSource.class); + for (Path dataSourcePath : dataSourcePaths) { + if (dataSourcePath.toFile().exists()) { + try { + DataSource dataSource = jsonObjectReader.readValue(dataSourcePath.toFile()); + newDataSources.add(dataSource); + sourceSet.add(dataSource.getCategory() + "__" + dataSource.getName()); + } catch (IOException e) { + logger.warn("Something wrong happened when reading data release source {}: {}", dataSourcePath, e.getMessage()); + } + } + } + + // Second, add previous data sources if necessary (to avoid duplicated sources) + for (DataSource source : currDataSources) { + String key = source.getCategory() + "__" + source.getName(); + if (!sourceSet.contains(key)) { + newDataSources.add(source); + } + } + + return newDataSources; + } } diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/GenericMongoDBAdaptorTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/GenericMongoDBAdaptorTest.java index 1b217c671d..4eaf34b026 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/GenericMongoDBAdaptorTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/GenericMongoDBAdaptorTest.java @@ -160,7 +160,7 @@ private void downloadAndPopulate() throws IOException, ExecutionException, Class loadData("conservation", "conservation", file.toPath(), true); } } - dataReleaseManager.update(dataRelease.getRelease(), "conservation", "conservation", Collections.emptyList()); + dataReleaseManager.update(dataRelease.getRelease(), "conservation", Collections.emptyList()); // Regulatory regions: regulatory_region.json.gz 
loadData("regulatory_region", "regulatory_region", baseDir.resolve("regulatory_region.json.gz")); @@ -174,7 +174,7 @@ private void downloadAndPopulate() throws IOException, ExecutionException, Class loadData("protein_functional_prediction", "protein_functional_prediction", file.toPath(), true); } } - dataReleaseManager.update(dataRelease.getRelease(), "protein_functional_prediction", "protein_functional_prediction", Collections.emptyList()); + dataReleaseManager.update(dataRelease.getRelease(), "protein_functional_prediction", Collections.emptyList()); // Variation: variation_chr_all.json.gz loadData("variation", "variation", baseDir.resolve("variation_chr_all.json.gz")); @@ -195,7 +195,7 @@ private void downloadAndPopulate() throws IOException, ExecutionException, Class // splice_score loadData("splice_score", "splice_score", baseDir.resolve("splice_score/spliceai/splice_score_all.json.gz"), true); loadData("splice_score", "splice_score", baseDir.resolve("splice_score/mmsplice/splice_score_all.json.gz"), true); - dataReleaseManager.update(dataRelease.getRelease(), "splice_score", "splice_score", Collections.emptyList()); + dataReleaseManager.update(dataRelease.getRelease(), "splice_score", Collections.emptyList()); // clinical_variants.full.json.gz loadData("clinical_variants", "clinical_variants", baseDir.resolve("clinical_variants.full.json.gz")); @@ -221,7 +221,7 @@ private void loadData(String collection, String data, Path filePath, boolean ski logger.info("Loading (" + collection + ", " + data + ") from file " + filePath); loadRunner.load(filePath, collection, dataRelease.getRelease()); if (!skipUpdate) { - dataReleaseManager.update(dataRelease.getRelease(), collection, data, Collections.emptyList()); + dataReleaseManager.update(dataRelease.getRelease(), collection, Collections.emptyList()); } } else { logger.error("(" + collection + ", " + data + ") not loading: file " + filePath + "does not exist"); From 2b4fbeb5ebd75cd302b24c8eac54aea840704a84 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 1 Aug 2024 17:11:37 +0200 Subject: [PATCH 121/148] app: update CellBase loader for genes and proteins according to the previous changes, #TASK-6142, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 4 +- .../admin/executors/LoadCommandExecutor.java | 62 ++++++------------- .../cellbase/lib/builders/ProteinBuilder.java | 2 +- 3 files changed, 23 insertions(+), 45 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 2b9f53fbf8..733a4c1e2a 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -49,7 +49,7 @@ import static org.opencb.cellbase.lib.builders.AbstractBuilder.BUILDING_LOG_MESSAGE; import static org.opencb.cellbase.lib.builders.EnsemblGeneBuilder.ENSEMBL_GENE_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.builders.GenomeSequenceFastaBuilder.GENOME_JSON_FILENAME; -import static org.opencb.cellbase.lib.builders.ProteinBuilder.OUTPUT_PROTEIN_OUTPUT_FILENAME; +import static org.opencb.cellbase.lib.builders.ProteinBuilder.PROTEIN_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.builders.RefSeqGeneBuilder.REFSEQ_GENE_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.builders.RegulatoryFeatureBuilder.*; import static org.opencb.cellbase.lib.builders.RepeatsBuilder.REPEATS_OUTPUT_FILENAME; @@ -373,7 +373,7 @@ private AbstractBuilder buildProtein() throws CellBaseException { // Sanity check Path proteinDownloadPath = downloadFolder.resolve(PROTEIN_DATA); Path proteinBuildPath = buildFolder.resolve(PROTEIN_DATA); - List filesToCheck = Arrays.asList(proteinBuildPath.resolve(OUTPUT_PROTEIN_OUTPUT_FILENAME), + List 
filesToCheck = Arrays.asList(proteinBuildPath.resolve(PROTEIN_OUTPUT_FILENAME), proteinBuildPath.resolve(getDataVersionFilename(INTERPRO_DATA)), proteinBuildPath.resolve(getDataVersionFilename(INTACT_DATA)), proteinBuildPath.resolve(getDataVersionFilename(UNIPROT_DATA))); diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index ccda7b0291..b7ea20c17e 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -42,7 +42,10 @@ import java.util.concurrent.ExecutionException; import static org.opencb.cellbase.lib.EtlCommons.*; +import static org.opencb.cellbase.lib.builders.EnsemblGeneBuilder.ENSEMBL_GENE_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.builders.GenomeSequenceFastaBuilder.GENOME_JSON_FILENAME; +import static org.opencb.cellbase.lib.builders.ProteinBuilder.PROTEIN_OUTPUT_FILENAME; +import static org.opencb.cellbase.lib.builders.RefSeqGeneBuilder.REFSEQ_GENE_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.builders.RegulatoryFeatureBuilder.*; import static org.opencb.cellbase.lib.builders.RepeatsBuilder.REPEATS_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.download.GenomeDownloadManager.GENOME_INFO_FILENAME; @@ -132,36 +135,7 @@ public void execute() throws CellBaseException { break; } case EtlCommons.GENE_DATA: { - // Load data - loadIfExists(input.resolve("gene.json.gz"), "gene"); - - // Create index - createIndex("gene"); - - // Update release (collection and sources) - List sources = new ArrayList<>(Arrays.asList( - input.resolve("dgidbVersion.json"), - input.resolve("ensemblCoreVersion.json"), - input.resolve("uniprotXrefVersion.json"), - input.resolve("geneExpressionAtlasVersion.json"), - input.resolve("hpoVersion.json"), - 
input.resolve("disgenetVersion.json"), - input.resolve("gnomadVersion.json") - )); - dataReleaseManager.update(dataRelease, "gene", sources); - break; - } - case EtlCommons.REFSEQ_DATA: { - // Load data - loadIfExists(input.resolve("refseq.json.gz"), "refseq"); - - // Create index - createIndex("refseq"); - - // Update release (collection and sources) - List sources = new ArrayList<>( - Collections.singletonList(input.resolve("refseqVersion.json"))); - dataReleaseManager.update(dataRelease, "refseq", sources); + loadGene(); break; } case EtlCommons.VARIATION_DATA: { @@ -203,18 +177,7 @@ public void execute() throws CellBaseException { break; } case EtlCommons.PROTEIN_DATA: { - // Load data - loadIfExists(input.resolve("protein.json.gz"), "protein"); - - // Create index - createIndex("protein"); - - // Update release (collection and sources) - List sources = new ArrayList<>(Arrays.asList( - input.resolve("uniprotVersion.json"), - input.resolve("interproVersion.json") - )); - dataReleaseManager.update(dataRelease, "protein", sources); + loadProtein(); break; } // case EtlCommons.PPI_DATA: @@ -432,6 +395,14 @@ private void loadGenome() throws CellBaseException { loadData(input.resolve(GENOME_DATA), collectionMap); } + private void loadGene() throws CellBaseException { + HashMap collectionMap = new HashMap<>(); + collectionMap.put(GENE_DATA, ENSEMBL_GENE_OUTPUT_FILENAME); + collectionMap.put(REFSEQ_DATA, REFSEQ_GENE_OUTPUT_FILENAME); + + loadData(input.resolve(GENE_DATA), collectionMap); + } + private void loadRepeats() throws CellBaseException { HashMap collectionMap = new HashMap<>(); collectionMap.put(REPEATS_DATA, REPEATS_OUTPUT_FILENAME); @@ -447,6 +418,13 @@ private void loadRegulation() throws CellBaseException { loadData(input.resolve(REGULATION_DATA), collectionMap); } + private void loadProtein() throws CellBaseException { + HashMap collectionMap = new HashMap<>(); + collectionMap.put(PROTEIN_DATA, PROTEIN_OUTPUT_FILENAME); + + 
loadData(input.resolve(PROTEIN_DATA), collectionMap); + } + private void loadSpliceScores() throws NoSuchMethodException, InterruptedException, ExecutionException, InstantiationException, IllegalAccessException, InvocationTargetException, ClassNotFoundException, IOException, CellBaseException, LoaderException { // Load data diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java index 1407d02239..c7e490d424 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java @@ -48,7 +48,7 @@ public class ProteinBuilder extends AbstractBuilder { private Path proteinPath; private String species; - public static final String OUTPUT_PROTEIN_OUTPUT_FILENAME = PROTEIN_DATA + ".json.gz"; + public static final String PROTEIN_OUTPUT_FILENAME = PROTEIN_DATA + ".json.gz"; public ProteinBuilder(Path proteinPath, String species, CellBaseSerializer serializer) { super(serializer); From d693f57afeb4902333c0841268e35bdbb542a50d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 1 Aug 2024 19:00:04 +0200 Subject: [PATCH 122/148] lib: add VariantBuilder to generate the variation JSON files from VCF files, and rename some constants, #TASK-5776, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 41 +++++++- .../org/opencb/cellbase/lib/EtlCommons.java | 14 +-- .../lib/builders/EnsemblGeneBuilder.java | 2 +- .../cellbase/lib/builders/GeneBuilder.java | 2 +- .../lib/builders/RefSeqGeneBuilder.java | 2 +- .../lib/builders/VariationBuilder.java | 94 +++++++++++++++++++ .../download/ConservationDownloadManager.java | 4 +- .../lib/download/GeneDownloadManager.java | 16 ++-- .../download/SpliceScoreDownloadManager.java | 2 +- .../download/VariationDownloadManager.java | 2 +- 10 files changed, 152 
insertions(+), 27 deletions(-) create mode 100644 cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 733a4c1e2a..9e1c5b4bc7 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -35,10 +35,7 @@ import org.opencb.cellbase.lib.builders.clinical.variant.ClinicalVariantBuilder; import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.nio.file.StandardCopyOption; +import java.nio.file.*; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -53,6 +50,7 @@ import static org.opencb.cellbase.lib.builders.RefSeqGeneBuilder.REFSEQ_GENE_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.builders.RegulatoryFeatureBuilder.*; import static org.opencb.cellbase.lib.builders.RepeatsBuilder.REPEATS_OUTPUT_FILENAME; +import static org.opencb.cellbase.lib.builders.VariationBuilder.VARIATION_CHR_PREFIX; import static org.opencb.cellbase.lib.download.GenomeDownloadManager.GENOME_INFO_FILENAME; @@ -145,6 +143,9 @@ public void execute() throws CellBaseException { case PROTEIN_DATA: parser = buildProtein(); break; + case VARIATION_DATA: + parser = buildVariation(); + break; case REGULATION_DATA: parser = buildRegulation(); break; @@ -392,6 +393,36 @@ private AbstractBuilder buildProtein() throws CellBaseException { return new ProteinBuilder(proteinDownloadPath, speciesConfiguration.getScientificName(), serializer); } + private AbstractBuilder buildVariation() throws CellBaseException, IOException { + logger.info(BUILDING_LOG_MESSAGE, getDataName(VARIATION_DATA)); + + // Sanity check + 
Path variationDownloadPath = downloadFolder.resolve(VARIATION_DATA); + Path variationBuildPath = buildFolder.resolve(VARIATION_DATA); + + List filesToCheck = new ArrayList<>(); + if (!speciesConfiguration.getId().equalsIgnoreCase(HSAPIENS)) { + filesToCheck.add(variationBuildPath.resolve(getDataVersionFilename(VARIATION_DATA))); + } + + try (DirectoryStream vcfPaths = Files.newDirectoryStream(variationBuildPath, + entry -> entry.getFileName().toString().startsWith(VARIATION_CHR_PREFIX))) { + if (AbstractBuilder.existFiles(filesToCheck) && vcfPaths.iterator().hasNext()) { + logger.warn(DATA_ALREADY_BUILT, getDataName(VARIATION_DATA)); + return null; + } + } + + // Copy version files + if (!speciesConfiguration.getId().equalsIgnoreCase(HSAPIENS)) { + copyVersionFiles(Arrays.asList(variationDownloadPath.resolve(getDataVersionFilename(VARIATION_DATA))), variationBuildPath); + } + + // Create the file serializer and the variation builder + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(variationBuildPath); + return new VariationBuilder(variationDownloadPath, speciesConfiguration.getScientificName(), serializer); + } + private AbstractBuilder buildConservation() throws CellBaseException { logger.info(BUILDING_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); @@ -429,7 +460,7 @@ private AbstractBuilder buildClinicalVariants() throws CellBaseException { private String getDefaultHumanAssembly() { for (SpeciesConfiguration species : configuration.getSpecies().getVertebrates()) { - if (species.getId().equals(HSAPIENS_NAME)) { + if (species.getId().equals(HSAPIENS)) { return species.getAssemblies().get(0).getName(); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 16cda025e0..d3b3147d38 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -42,12 +42,12 
@@ public final class EtlCommons { // Commons - public static final String HOMO_SAPIENS_NAME= "Homo sapiens"; - public static final String HSAPIENS_NAME= "hsapiens"; - public static final String MUS_MUSCULUS_NAME= "Mus musculus"; - public static final String RATTUS_NORVEGICUS_NAME= "Rattus norvegicus"; - public static final String BOS_TAURUS_NAME= "Bos taurus"; - public static final String DANIO_RERIO_NAME= "Danio rerio"; + public static final String HOMO_SAPIENS = "Homo sapiens"; + public static final String HSAPIENS = "hsapiens"; + public static final String MUS_MUSCULUS = "Mus musculus"; + public static final String RATTUS_NORVEGICUS = "Rattus norvegicus"; + public static final String BOS_TAURUS = "Bos taurus"; + public static final String DANIO_RERIO = "Danio rerio"; public static final String GRCH38_NAME = "GRCh38"; public static final String GRCH37_NAME = "GRCh37"; @@ -745,7 +745,7 @@ public static List getDataList(String data, CellBaseConfiguration config private static List getRepeatsDataList(CellBaseConfiguration configuration, SpeciesConfiguration speciesConfiguration) { List dataList = new ArrayList<>(); String speciesId = speciesConfiguration.getId().toUpperCase(Locale.ROOT); - if (speciesId.equalsIgnoreCase(HSAPIENS_NAME)) { + if (speciesId.equalsIgnoreCase(HSAPIENS)) { return Arrays.asList(TRF_DATA, WM_DATA, GSD_DATA); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java index 044d9bc232..32d779e7ce 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java @@ -105,7 +105,7 @@ public EnsemblGeneBuilder(Path downloadPath, SpeciesConfiguration speciesConfigu transcriptDict = new HashMap<>(250000); exonDict = new HashMap<>(8000000); - if 
(speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS)) { isHSapiens = true; } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java index b850f9b40a..785b296982 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java @@ -84,7 +84,7 @@ public static List getCommonDataSources(SpeciesConfiguration speciesConf List dataList = new ArrayList<>(); boolean isHSapiens = false; - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS)) { isHSapiens = true; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java index b470b2cb21..9ddb4e6a7c 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java @@ -87,7 +87,7 @@ public RefSeqGeneBuilder(Path downloadPath, SpeciesConfiguration speciesConfigur transcriptDict = new HashMap<>(250000); exonDict = new HashMap<>(8000000); - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS)) { isHSapiens = true; } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java new file mode 100644 index 0000000000..6f061ccd02 --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java @@ -0,0 +1,94 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed 
under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.opencb.cellbase.lib.builders; + +import org.opencb.biodata.formats.variant.io.VariantReader; +import org.opencb.biodata.models.variant.Variant; +import org.opencb.biodata.models.variant.VariantFileMetadata; +import org.opencb.biodata.models.variant.metadata.VariantStudyMetadata; +import org.opencb.biodata.tools.variant.VariantNormalizer; +import org.opencb.biodata.tools.variant.VariantVcfHtsjdkReader; +import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; +import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; + +import java.io.IOException; +import java.nio.file.DirectoryStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Iterator; +import java.util.Locale; + +import static org.opencb.cellbase.lib.EtlCommons.ENSEMBL_DATA; +import static org.opencb.cellbase.lib.EtlCommons.HOMO_SAPIENS; + +/** + * Created by jtarraga on 01/08/24 + */ +public class VariationBuilder extends AbstractBuilder { + + private Path downloadPath; + private String species; + + public static final String VARIATION_CHR_PREFIX = "variation_chr"; + + public VariationBuilder(Path downloadPath, String species, CellBaseFileSerializer fileSerializer) { + super(fileSerializer); + + this.downloadPath = downloadPath; + this.species = species; + } + + @Override + public void parse() throws IOException { + if (!species.equalsIgnoreCase(HOMO_SAPIENS)) { + parseVcf(); + } + } + + private void 
parseVcf() throws IOException { + VariantNormalizer.VariantNormalizerConfig normalizerConfig = new VariantNormalizer.VariantNormalizerConfig() + .setReuseVariants(true) + .setNormalizeAlleles(true) + .setDecomposeMNVs(false); + + CellBaseJsonFileSerializer fileSerializer = (CellBaseJsonFileSerializer) this.serializer; + + // Usually we expect two VCF files prefixed by the species scientific name + // e.g., for 'Mus musculus' the VCF files are 'mus_musculus.vcf.gz' and 'mus_musculus_structural_variations.vcf.gz' + String prefix = species.toLowerCase(Locale.ROOT).replace(" ", "_"); + + try (DirectoryStream vcfPaths = Files.newDirectoryStream(downloadPath, + entry -> entry.getFileName().toString().startsWith(prefix))) { + for (Path vcfPath : vcfPaths) { + VariantStudyMetadata variantStudyMetadata = new VariantFileMetadata(vcfPath.getFileName().toString(), "") + .toVariantStudyMetadata(ENSEMBL_DATA); + VariantReader variantVcfReader = new VariantVcfHtsjdkReader(vcfPath, variantStudyMetadata, + new VariantNormalizer(normalizerConfig)); + + // Write variant to the JSON files according to the chromosome + Iterator iterator = variantVcfReader.iterator(); + while (iterator.hasNext()) { + Variant variant = iterator.next(); + fileSerializer.serialize(variant, VARIATION_CHR_PREFIX + variant.getChromosome()); + } + variantVcfReader.close(); + } + } + + fileSerializer.close(); + } +} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ConservationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ConservationDownloadManager.java index 64be42ed7c..7537e703f6 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ConservationDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ConservationDownloadManager.java @@ -87,7 +87,7 @@ public List downloadConservation() throws IOException, Interrupted String gerpUrl = null; // Human - if 
(speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS)) { // 1. PhastCons and PhyloP String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "M"}; @@ -128,7 +128,7 @@ public List downloadConservation() throws IOException, Interrupted } // Mouse - if (speciesConfiguration.getScientificName().equals(MUS_MUSCULUS_NAME)) { + if (speciesConfiguration.getScientificName().equals(MUS_MUSCULUS)) { String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); // 1. PhastCons and PhyloP diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index 5af9f01097..6bb219fb2e 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -83,7 +83,7 @@ public List download() throws IOException, InterruptedException, C logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_ANNOTATION_DATA)); // Save data sources manually downloaded - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS)) { // HPO if (Files.exists(geneDownloadPath.resolve(getDataVersionFilename(HPO_DISEASE_DATA)))) { logger.warn("The version file {} already exists", getDataVersionFilename(HPO_DISEASE_DATA)); @@ -245,7 +245,7 @@ private DownloadFile downloadMane(Path geneDownloadPath) throws IOException, Int DownloadFile downloadFile = null; // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME) + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS) && 
!isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(MANE_SELECT_DATA)), getDataName(MANE_SELECT_DATA))) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(MANE_SELECT_DATA)); @@ -262,7 +262,7 @@ private DownloadFile downloadLrg(Path geneDownloadPath) throws IOException, Inte DownloadFile downloadFile = null; // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS)) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(LRG_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getLrg(), LRG_FILE_ID, LRG_DATA, geneDownloadPath); @@ -276,7 +276,7 @@ private DownloadFile downloadHgnc(Path geneDownloadPath) throws IOException, Int DownloadFile downloadFile = null; // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME) + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS) && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(HGNC_DATA)), getDataName(HGNC_DATA))) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(HGNC_DATA)); @@ -291,7 +291,7 @@ private DownloadFile downloadCancerHotspot(Path geneDownloadPath) throws IOExcep DownloadFile downloadFile = null; // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME) + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS) && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(CANCER_HOTSPOT_DATA)), getDataName(CANCER_HOTSPOT_DATA))) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CANCER_HOTSPOT_DATA)); @@ -308,7 +308,7 @@ private DownloadFile downloadDrugData(Path geneDownloadPath) throws IOException, DownloadFile downloadFile = null; // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME) + if 
(speciesConfiguration.getScientificName().equals(HOMO_SAPIENS) && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(DGIDB_DATA)), getDataName(DGIDB_DATA))) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(DGIDB_DATA)); @@ -341,7 +341,7 @@ private DownloadFile downloadGeneExpressionAtlas(Path geneDownloadPath) throws I DownloadFile downloadFile = null; // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME) + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS) && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(GENE_EXPRESSION_ATLAS_DATA)), getDataName(GENE_EXPRESSION_ATLAS_DATA))) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_EXPRESSION_ATLAS_DATA)); @@ -358,7 +358,7 @@ private DownloadFile downloadGnomadConstraints(Path geneDownloadPath) throws IOE DownloadFile downloadFile = null; // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME) + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS) && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(GNOMAD_CONSTRAINTS_DATA)), getDataName(GNOMAD_CONSTRAINTS_DATA))) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GNOMAD_CONSTRAINTS_DATA)); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/SpliceScoreDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/SpliceScoreDownloadManager.java index 20bacf80be..9f846f5cdb 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/SpliceScoreDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/SpliceScoreDownloadManager.java @@ -38,7 +38,7 @@ public SpliceScoreDownloadManager(String species, String assembly, Path outdir, @Override public List download() throws IOException, InterruptedException, CellBaseException { // Check if the species is supported - if 
(!speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + if (!speciesConfiguration.getScientificName().equals(HOMO_SAPIENS)) { logger.info("{} not supported for the species {}", getDataName(SPLICE_SCORE_DATA), speciesConfiguration.getScientificName()); return Collections.emptyList(); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java index ce396db682..24be1bdc98 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java @@ -47,7 +47,7 @@ public List downloadVariation() throws IOException, InterruptedExc // Check if species is supported // and we do not need to download human variation data from Ensembl. It is already included in the CellBase. if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), VARIATION_DATA) - && !speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + && !speciesConfiguration.getScientificName().equals(HOMO_SAPIENS)) { Path variationFolder = downloadFolder.resolve(VARIATION_DATA); Files.createDirectories(variationFolder); From 38400c1e57abd2d67a07c4ef1fd460ac72a7f2b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 1 Aug 2024 19:15:22 +0200 Subject: [PATCH 123/148] app: update the CellBase loader for variation data according to the latest changes, #TASK-6142, #TASK-5564 --- .../admin/executors/LoadCommandExecutor.java | 30 +++++-------------- .../lib/builders/VariationBuilder.java | 2 +- 2 files changed, 8 insertions(+), 24 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index 
b7ea20c17e..e343949f0f 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -48,6 +48,7 @@ import static org.opencb.cellbase.lib.builders.RefSeqGeneBuilder.REFSEQ_GENE_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.builders.RegulatoryFeatureBuilder.*; import static org.opencb.cellbase.lib.builders.RepeatsBuilder.REPEATS_OUTPUT_FILENAME; +import static org.opencb.cellbase.lib.builders.VariationBuilder.VARIATION_CHR_PREFIX; import static org.opencb.cellbase.lib.download.GenomeDownloadManager.GENOME_INFO_FILENAME; /** @@ -139,8 +140,7 @@ public void execute() throws CellBaseException { break; } case EtlCommons.VARIATION_DATA: { - // Load data, create index and update release - loadVariationData(); + loadVariation(); break; } case EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA: { @@ -303,31 +303,15 @@ private void checkParameters() throws CellBaseException { dataRelease = getDataReleaseForLoading(dataReleaseManager).getRelease(); } - private void loadVariationData() throws NoSuchMethodException, InterruptedException, ExecutionException, + private void loadVariation() throws NoSuchMethodException, InterruptedException, ExecutionException, InstantiationException, IllegalAccessException, InvocationTargetException, ClassNotFoundException, IOException, LoaderException, CellBaseException { - // First load data - // Common loading process from CellBase variation data models if (field == null) { - DirectoryStream stream = Files.newDirectoryStream(input, - entry -> entry.getFileName().toString().startsWith("variation_chr")); - - for (Path entry : stream) { - logger.info("Loading file '{}'", entry); - loadRunner.load(input.resolve(entry.getFileName()), "variation", dataRelease); - } - - // Create index - createIndex("variation"); - - // Update release (collection and sources) - List sources = new 
ArrayList<>(Arrays.asList( - input.resolve("ensemblVariationVersion.json") - )); - dataReleaseManager.update(dataRelease, "variation", sources); - - // Custom update required e.g. population freqs loading + // First load data + // Common loading process from CellBase variation data models + loadData(input.resolve(VARIATION_DATA), VARIATION_DATA, VARIATION_CHR_PREFIX); } else { + // Custom update required e.g. population freqs loading logger.info("Loading file '{}'", input); loadRunner.load(input, "variation", dataRelease, field, innerFields); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java index 6f061ccd02..fc3d30dc25 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java @@ -36,7 +36,7 @@ import static org.opencb.cellbase.lib.EtlCommons.HOMO_SAPIENS; /** - * Created by jtarraga on 01/08/24 + * Created by jtarraga on 01/08/24. 
*/ public class VariationBuilder extends AbstractBuilder { From 311733715a1156760d0b98eb55f3744c41f9348b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 2 Aug 2024 09:49:14 +0200 Subject: [PATCH 124/148] app: add check before building variation data, #TASK-5776, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 9e1c5b4bc7..89dc1cfc71 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -400,16 +400,18 @@ private AbstractBuilder buildVariation() throws CellBaseException, IOException { Path variationDownloadPath = downloadFolder.resolve(VARIATION_DATA); Path variationBuildPath = buildFolder.resolve(VARIATION_DATA); - List filesToCheck = new ArrayList<>(); - if (!speciesConfiguration.getId().equalsIgnoreCase(HSAPIENS)) { - filesToCheck.add(variationBuildPath.resolve(getDataVersionFilename(VARIATION_DATA))); - } + if (Files.exists(variationBuildPath)) { + List filesToCheck = new ArrayList<>(); + if (!speciesConfiguration.getId().equalsIgnoreCase(HSAPIENS)) { + filesToCheck.add(variationBuildPath.resolve(getDataVersionFilename(VARIATION_DATA))); + } - try (DirectoryStream vcfPaths = Files.newDirectoryStream(variationBuildPath, - entry -> entry.getFileName().toString().startsWith(VARIATION_CHR_PREFIX))) { - if (AbstractBuilder.existFiles(filesToCheck) && vcfPaths.iterator().hasNext()) { - logger.warn(DATA_ALREADY_BUILT, getDataName(VARIATION_DATA)); - return null; + try (DirectoryStream vcfPaths = Files.newDirectoryStream(variationBuildPath, + entry -> 
entry.getFileName().toString().startsWith(VARIATION_CHR_PREFIX))) { + if (AbstractBuilder.existFiles(filesToCheck) && vcfPaths.iterator().hasNext()) { + logger.warn(DATA_ALREADY_BUILT, getDataName(VARIATION_DATA)); + return null; + } } } From 9c810e7d879fbed68e379bbad665c57eae434542 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 2 Aug 2024 10:09:59 +0200 Subject: [PATCH 125/148] lib: skip API-KEY param when parsing variant query, #TASK-5564 --- .../opencb/cellbase/lib/impl/core/VariantMongoDBAdaptor.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/VariantMongoDBAdaptor.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/VariantMongoDBAdaptor.java index fc4b602cd9..29dab0f210 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/VariantMongoDBAdaptor.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/VariantMongoDBAdaptor.java @@ -50,6 +50,7 @@ import java.util.*; import java.util.function.Consumer; +import static org.opencb.cellbase.core.ParamConstants.API_KEY_PARAM; import static org.opencb.cellbase.core.ParamConstants.DATA_RELEASE_PARAM; import static org.opencb.cellbase.lib.MongoDBCollectionConfiguration.VARIATION_FUNCTIONAL_SCORE_CHUNK_SIZE; @@ -253,6 +254,7 @@ public Bson parseQuery(VariantQuery query) { case "region": createRegionQuery(query, query.getRegions(), MongoDBCollectionConfiguration.VARIATION_CHUNK_SIZE, andBsonList); break; + case API_KEY_PARAM: case DATA_RELEASE_PARAM: case "svType": // don't do anything, this is parsed later From ec5f21a0599e1d6dd56733e9b6313d961e9edb36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 2 Aug 2024 10:56:15 +0200 Subject: [PATCH 126/148] server: update RESTful server to take into account multi-species, #TASK-6426, #TASK-5564 --- .../org/opencb/cellbase/server/rest/FileWSServer.java | 8 +++++++- 
.../opencb/cellbase/server/rest/GenericRestWSServer.java | 5 ----- .../org/opencb/cellbase/server/rest/MetaWSServer.java | 8 ++++++-- .../opencb/cellbase/server/rest/PublicationWSServer.java | 7 ++++++- 4 files changed, 19 insertions(+), 9 deletions(-) diff --git a/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/FileWSServer.java b/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/FileWSServer.java index de97c37718..7c7058dc7a 100644 --- a/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/FileWSServer.java +++ b/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/FileWSServer.java @@ -31,6 +31,7 @@ import javax.ws.rs.GET; import javax.ws.rs.Path; import javax.ws.rs.PathParam; +import javax.ws.rs.QueryParam; import javax.ws.rs.core.Context; import javax.ws.rs.core.Response; import javax.ws.rs.core.UriInfo; @@ -47,9 +48,14 @@ public class FileWSServer extends GenericRestWSServer { public FileWSServer(@PathParam("apiVersion") @ApiParam(name = "apiVersion", value = ParamConstants.VERSION_DESCRIPTION, defaultValue = ParamConstants.DEFAULT_VERSION) String apiVersion, + @PathParam("species") + @ApiParam(name = "species", value = ParamConstants.SPECIES_DESCRIPTION, + defaultValue = ParamConstants.DEFAULT_SPECIES, required = true) String species, + @ApiParam(name = "assembly", value = ParamConstants.ASSEMBLY_DESCRIPTION, + defaultValue = ParamConstants.DEFAULT_ASSEMBLY) @QueryParam("assembly") String assembly, @Context UriInfo uriInfo, @Context HttpServletRequest hsr) throws CellBaseServerException { - super(apiVersion, uriInfo, hsr); + super(apiVersion, species, assembly, uriInfo, hsr); try { fileManager = cellBaseManagerFactory.getFileManager(); } catch (Exception e) { diff --git a/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/GenericRestWSServer.java b/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/GenericRestWSServer.java index fc961bb9ae..0969f874fb 100755 --- 
a/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/GenericRestWSServer.java +++ b/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/GenericRestWSServer.java @@ -101,11 +101,6 @@ public class GenericRestWSServer implements IWSServer { protected static String defaultApiKey; protected static ApiKeyManager apiKeyManager; - public GenericRestWSServer(@PathParam("version") String version, @Context UriInfo uriInfo, @Context HttpServletRequest hsr) - throws CellBaseServerException { - this(version, "hsapiens", null, uriInfo, hsr); - } - public GenericRestWSServer(@PathParam("version") String version, @PathParam("species") String species, @PathParam("assembly") String assembly, @Context UriInfo uriInfo, @Context HttpServletRequest hsr) diff --git a/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/MetaWSServer.java b/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/MetaWSServer.java index 331f562585..db26f7ee83 100644 --- a/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/MetaWSServer.java +++ b/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/MetaWSServer.java @@ -71,9 +71,14 @@ public class MetaWSServer extends GenericRestWSServer { public MetaWSServer(@PathParam("apiVersion") @ApiParam(name = "apiVersion", value = ParamConstants.VERSION_DESCRIPTION, defaultValue = ParamConstants.DEFAULT_VERSION) String apiVersion, + @PathParam("species") + @ApiParam(name = "species", value = ParamConstants.SPECIES_DESCRIPTION, + defaultValue = ParamConstants.DEFAULT_SPECIES, required = true) String species, + @ApiParam(name = "assembly", value = ParamConstants.ASSEMBLY_DESCRIPTION, + defaultValue = ParamConstants.DEFAULT_ASSEMBLY) @QueryParam("assembly") String assembly, @Context UriInfo uriInfo, @Context HttpServletRequest hsr) throws CellBaseServerException { - super(apiVersion, uriInfo, hsr); + super(apiVersion, species, assembly, uriInfo, hsr); try { metaManager = cellBaseManagerFactory.getMetaManager(); } 
catch (Exception e) { @@ -101,7 +106,6 @@ public Response getVersion(@PathParam("species") return createErrorResponse("getVersion", "Invalid species: '" + species + "' or assembly: '" + assembly + "'"); } - logger.error("species " + species); CellBaseDataResult queryResult = metaManager.getVersions(species, assembly); return createOkResponse(queryResult); } catch (CellBaseException e) { diff --git a/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/PublicationWSServer.java b/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/PublicationWSServer.java index 11734d11b3..a881f4be4e 100644 --- a/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/PublicationWSServer.java +++ b/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/PublicationWSServer.java @@ -48,13 +48,18 @@ public class PublicationWSServer extends GenericRestWSServer { public PublicationWSServer(@PathParam("apiVersion") @ApiParam(name = "apiVersion", value = ParamConstants.VERSION_DESCRIPTION, defaultValue = ParamConstants.DEFAULT_VERSION) String apiVersion, + @PathParam("species") + @ApiParam(name = "species", value = ParamConstants.SPECIES_DESCRIPTION, + defaultValue = ParamConstants.DEFAULT_SPECIES, required = true) String species, + @ApiParam(name = "assembly", value = ParamConstants.ASSEMBLY_DESCRIPTION, + defaultValue = ParamConstants.DEFAULT_ASSEMBLY) @QueryParam("assembly") String assembly, @ApiParam(name = "dataRelease", value = DATA_RELEASE_DESCRIPTION) @DefaultValue("0") @QueryParam("dataRelease") int dataRelease, @ApiParam(name = "apiKey", value = API_KEY_DESCRIPTION) @DefaultValue("") @QueryParam("apiKey") String apiKey, @Context UriInfo uriInfo, @Context HttpServletRequest hsr) throws CellBaseServerException { - super(apiVersion, uriInfo, hsr); + super(apiVersion, species, assembly, uriInfo, hsr); try { publicationManager = cellBaseManagerFactory.getPublicationManager(); } catch (Exception e) { From 36c36096b709589d59595ef3e06cabe483ebab24 Mon Sep 
17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 2 Aug 2024 11:57:51 +0200 Subject: [PATCH 127/148] lib: extract the FutureSpliceScoreAnnotator in a file to reduce the VariantAnnotationCalculator file size, #TASK-6426, #TASK-5564 --- .../futures/FutureSpliceScoreAnnotator.java | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/futures/FutureSpliceScoreAnnotator.java diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/futures/FutureSpliceScoreAnnotator.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/futures/FutureSpliceScoreAnnotator.java new file mode 100644 index 0000000000..8cf478f7eb --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/futures/FutureSpliceScoreAnnotator.java @@ -0,0 +1,110 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.opencb.cellbase.lib.variant.annotation.futures; + +import org.apache.commons.collections4.CollectionUtils; +import org.apache.commons.lang3.StringUtils; +import org.opencb.biodata.models.core.SpliceScore; +import org.opencb.biodata.models.core.SpliceScoreAlternate; +import org.opencb.biodata.models.variant.Variant; +import org.opencb.biodata.models.variant.avro.ConsequenceType; +import org.opencb.biodata.models.variant.avro.SpliceScores; +import org.opencb.biodata.models.variant.avro.VariantAnnotation; +import org.opencb.cellbase.core.result.CellBaseDataResult; +import org.opencb.cellbase.lib.managers.VariantManager; +import org.opencb.commons.datastore.core.QueryOptions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.*; + +public class FutureSpliceScoreAnnotator implements Callable>> { + private List variantList; + private QueryOptions queryOptions; + private int dataRelease; + private String apiKey; + private VariantManager variantManager; + + private static Logger logger = LoggerFactory.getLogger(FutureSpliceScoreAnnotator.class); + + public FutureSpliceScoreAnnotator(List variantList, QueryOptions queryOptions, int dataRelease, String apiKey, + VariantManager variantManager) { + this.variantList = variantList; + this.queryOptions = queryOptions; + this.dataRelease = dataRelease; + this.apiKey = apiKey; + this.variantManager = variantManager; + } + + @Override + public List> call() throws Exception { + long startTime = System.currentTimeMillis(); + + List> cellBaseDataResultList = new ArrayList<>(variantList.size()); + + logger.debug("Query splice"); + // Want to return only one CellBaseDataResult object per Variant + for (Variant variant : variantList) { + cellBaseDataResultList.add(variantManager.getSpliceScoreVariant(variant, apiKey, dataRelease)); + } + logger.debug("Splice score query performance is {}ms for {} variants", 
System.currentTimeMillis() - startTime, + variantList.size()); + return cellBaseDataResultList; + } + + public void processResults(Future>> spliceFuture, + List variantAnnotationList) + throws InterruptedException, ExecutionException { + List> spliceCellBaseDataResults; + try { + spliceCellBaseDataResults = spliceFuture.get(30, TimeUnit.SECONDS); + } catch (TimeoutException e) { + spliceFuture.cancel(true); + throw new ExecutionException("Unable to finish splice score query on time", e); + } + + if (CollectionUtils.isNotEmpty(spliceCellBaseDataResults)) { + for (int i = 0; i < variantAnnotationList.size(); i++) { + CellBaseDataResult spliceScoreResult = spliceCellBaseDataResults.get(i); + if (spliceScoreResult != null && CollectionUtils.isNotEmpty(spliceScoreResult.getResults())) { + for (SpliceScore spliceScore : spliceScoreResult.getResults()) { + for (ConsequenceType ct : variantAnnotationList.get(i).getConsequenceTypes()) { + for (SpliceScoreAlternate spliceScoreAlt : spliceScore.getAlternates()) { + String alt = StringUtils.isEmpty(variantAnnotationList.get(i).getAlternate()) + ? 
"-" + : variantAnnotationList.get(i).getAlternate(); + if (alt.equals(spliceScoreAlt.getAltAllele())) { + if (StringUtils.isEmpty(spliceScore.getTranscriptId()) + || StringUtils.isEmpty(ct.getTranscriptId()) + || spliceScore.getTranscriptId().equals(ct.getTranscriptId())) { + SpliceScores scores = new SpliceScores(spliceScore.getSource(), spliceScoreAlt.getScores()); + if (ct.getSpliceScores() == null) { + ct.setSpliceScores(new ArrayList<>()); + } + ct.getSpliceScores().add(scores); + } + } + } + } + } + } + } + } + } +} From efa4824d0012cfac7837532e90c02087a869a16b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 2 Aug 2024 11:59:20 +0200 Subject: [PATCH 128/148] lib: update the VariantAnnotationCalculator to support multi-species, #TASK-6426, #TASK-5564 --- .../admin/executors/LoadCommandExecutor.java | 2 + .../executors/ValidationCommandExecutor.java | 2 +- .../VariantAnnotationCommandExecutor.java | 4 +- .../cellbase/lib/managers/VariantManager.java | 8 +- .../VariantAnnotationCalculator.java | 109 +++++------------- .../core/VariantAnnotationCalculatorTest.java | 14 ++- .../lib/managers/DataReleaseManagerTest.java | 2 +- .../lib/variant/VariantManagerTest.java | 3 +- 8 files changed, 51 insertions(+), 93 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index e343949f0f..d373981407 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -92,6 +92,8 @@ public LoadCommandExecutor(AdminCliOptionsParser.LoadCommandOptions loadCommandO } else { loadOptions = loadCommandOptions.data.split(","); } + + if (loadCommandOptions.field != null) { field = loadCommandOptions.field; } diff --git 
a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ValidationCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ValidationCommandExecutor.java index 612e8d6a38..764de7b0df 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ValidationCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ValidationCommandExecutor.java @@ -78,7 +78,7 @@ public void execute() { DataRelease dataRelease = dataReleaseManager.get(validationCommandOptions.dataRelease); variantAnnotationCalculator = new VariantAnnotationCalculator(validationCommandOptions.species, validationCommandOptions.assembly, dataRelease, validationCommandOptions.apiKey, - cellBaseManagerFactory); + cellBaseManagerFactory, configuration); } catch (CellBaseException e) { e.printStackTrace(); return; diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/main/executors/VariantAnnotationCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/main/executors/VariantAnnotationCommandExecutor.java index d2285d5550..731a7220ca 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/main/executors/VariantAnnotationCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/main/executors/VariantAnnotationCommandExecutor.java @@ -207,7 +207,7 @@ private boolean runAnnotation() throws Exception { DataReleaseManager dataReleaseManager = cellBaseManagerFactory.getDataReleaseManager(species, assembly); DataRelease dataRelease = dataReleaseManager.get(variantAnnotationCommandOptions.dataRelease); VariantAnnotationCalculator variantAnnotationCalculator = new VariantAnnotationCalculator(species, assembly, - dataRelease, variantAnnotationCommandOptions.apiKey, cellBaseManagerFactory); + dataRelease, variantAnnotationCommandOptions.apiKey, cellBaseManagerFactory, configuration); List> annotationByVariantList = 
variantAnnotationCalculator.getAnnotationByVariantList(variants, serverQueryOptions); @@ -485,7 +485,7 @@ private VariantAnnotator createCellBaseAnnotator() throws CellBaseException { DataReleaseManager dataReleaseManager = cellBaseManagerFactory.getDataReleaseManager(species, assembly); DataRelease dataRelease = dataReleaseManager.get(variantAnnotationCommandOptions.dataRelease); return new CellBaseLocalVariantAnnotator(new VariantAnnotationCalculator(species, assembly, dataRelease, - variantAnnotationCommandOptions.apiKey, cellBaseManagerFactory), serverQueryOptions); + variantAnnotationCommandOptions.apiKey, cellBaseManagerFactory, configuration), serverQueryOptions); } else { try { ClientConfiguration clientConfiguration = ClientConfiguration.load(getClass() diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/VariantManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/VariantManager.java index 28f5c70fa7..72b32c2a5e 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/VariantManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/VariantManager.java @@ -28,6 +28,7 @@ import org.opencb.biodata.models.variant.avro.VariantType; import org.opencb.cellbase.core.ParamConstants; import org.opencb.cellbase.core.api.VariantQuery; +import org.opencb.cellbase.core.api.key.ApiKeyLicensedDataUtils; import org.opencb.cellbase.core.api.query.CellBaseQueryOptions; import org.opencb.cellbase.core.api.query.QueryException; import org.opencb.cellbase.core.config.CellBaseConfiguration; @@ -38,7 +39,6 @@ import org.opencb.cellbase.lib.impl.core.CellBaseCoreDBAdaptor; import org.opencb.cellbase.lib.impl.core.SpliceScoreMongoDBAdaptor; import org.opencb.cellbase.lib.impl.core.VariantMongoDBAdaptor; -import org.opencb.cellbase.core.api.key.ApiKeyLicensedDataUtils; import org.opencb.cellbase.lib.variant.VariantAnnotationUtils; import 
org.opencb.cellbase.lib.variant.annotation.CellBaseNormalizerSequenceAdaptor; import org.opencb.cellbase.lib.variant.annotation.VariantAnnotationCalculator; @@ -96,7 +96,7 @@ public List> getHgvsByVariant(String variants, DataRe HgvsCalculator hgvsCalculator = new HgvsCalculator(genomeManager, dataRelease.getRelease()); List> results = new ArrayList<>(); VariantAnnotationCalculator variantAnnotationCalculator = new VariantAnnotationCalculator(species, assembly, - dataRelease, "", cellbaseManagerFactory); + dataRelease, "", cellbaseManagerFactory, configuration); List batchGeneList = variantAnnotationCalculator.getBatchGeneList(variantList); for (Variant variant : variantList) { List variantGeneList = variantAnnotationCalculator.getAffectedGenes(batchGeneList, variant); @@ -120,7 +120,7 @@ public CellBaseDataResult getNormalizationByVariant(String variants, bo DataRelease dataRelease) throws CellBaseException { List variantList = parseVariants(variants); VariantAnnotationCalculator variantAnnotationCalculator = new VariantAnnotationCalculator(species, assembly, - dataRelease, "", cellbaseManagerFactory); + dataRelease, "", cellbaseManagerFactory, configuration); // Set decompose MNV behaviour @@ -195,7 +195,7 @@ public List> getAnnotationByVariant(QueryO } VariantAnnotationCalculator variantAnnotationCalculator = new VariantAnnotationCalculator(species, assembly, - dataRelease, apiKey, cellbaseManagerFactory); + dataRelease, apiKey, cellbaseManagerFactory, configuration); List> queryResults = variantAnnotationCalculator.getAnnotationByVariantList(variantList, queryOptions); return queryResults; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/VariantAnnotationCalculator.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/VariantAnnotationCalculator.java index a503ba7045..b033cb352f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/VariantAnnotationCalculator.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/VariantAnnotationCalculator.java @@ -33,13 +33,16 @@ import org.opencb.cellbase.core.api.RepeatsQuery; import org.opencb.cellbase.core.api.query.LogicalList; import org.opencb.cellbase.core.api.query.QueryException; +import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.models.DataRelease; import org.opencb.cellbase.core.result.CellBaseDataResult; +import org.opencb.cellbase.core.utils.SpeciesUtils; import org.opencb.cellbase.lib.EtlCommons; import org.opencb.cellbase.lib.managers.*; import org.opencb.cellbase.lib.variant.VariantAnnotationUtils; import org.opencb.cellbase.lib.variant.annotation.futures.FuturePharmacogenomicsAnnotator; +import org.opencb.cellbase.lib.variant.annotation.futures.FutureSpliceScoreAnnotator; import org.opencb.cellbase.lib.variant.hgvs.HgvsCalculator; import org.opencb.commons.datastore.core.QueryOptions; import org.slf4j.Logger; @@ -53,6 +56,7 @@ import static org.opencb.cellbase.core.ParamConstants.API_KEY_PARAM; import static org.opencb.cellbase.core.variant.PhasedQueryManager.*; +import static org.opencb.cellbase.lib.EtlCommons.*; /** * Created by imedina on 06/02/16. 
@@ -74,11 +78,16 @@ public class VariantAnnotationCalculator { private RepeatsManager repeatsManager; private ProteinManager proteinManager; private PharmacogenomicsManager pharmacogenomicsManager; + private DataRelease dataRelease; private String apiKey; private Set annotatorSet; private List includeGeneFields; + private String species; + private String assembly; + private CellBaseConfiguration configuration; + private final VariantNormalizer normalizer; private boolean normalize = false; private boolean decompose = false; @@ -99,7 +108,14 @@ public class VariantAnnotationCalculator { private static Logger logger = LoggerFactory.getLogger(VariantAnnotationCalculator.class); public VariantAnnotationCalculator(String species, String assembly, DataRelease dataRelease, String apiKey, - CellBaseManagerFactory cellbaseManagerFactory) throws CellBaseException { + CellBaseManagerFactory cellbaseManagerFactory, CellBaseConfiguration configuration) + throws CellBaseException { + logger.debug("VariantAnnotationCalculator: in 'constructor'"); + + this.species = species; + this.assembly = assembly; + this.configuration = configuration; + this.genomeManager = cellbaseManagerFactory.getGenomeManager(species, assembly); this.variantManager = cellbaseManagerFactory.getVariantManager(species, assembly); this.geneManager = cellbaseManagerFactory.getGeneManager(species, assembly); @@ -118,9 +134,9 @@ public VariantAnnotationCalculator(String species, String assembly, DataRelease // at parseQueryParam this.normalizer = new VariantNormalizer(getNormalizerConfig()); - hgvsCalculator = new HgvsCalculator(genomeManager, this.dataRelease.getRelease()); + this.hgvsCalculator = new HgvsCalculator(genomeManager, this.dataRelease.getRelease()); + - logger.debug("VariantAnnotationMongoDBAdaptor: in 'constructor'"); } private VariantNormalizer.VariantNormalizerConfig getNormalizerConfig() { @@ -467,7 +483,7 @@ private List runAnnotationProcess(List normalizedVar FutureConservationAnnotator 
futureConservationAnnotator = null; Future>> conservationFuture = null; - if (annotatorSet.contains("conservation")) { + if (SpeciesUtils.hasData(configuration, species, CONSERVATION_DATA) && annotatorSet.contains("conservation")) { futureConservationAnnotator = new FutureConservationAnnotator(normalizedVariantList, QueryOptions.empty(), dataRelease.getRelease()); conservationFuture = CACHED_THREAD_POOL.submit(futureConservationAnnotator); @@ -475,7 +491,7 @@ private List runAnnotationProcess(List normalizedVar FutureVariantFunctionalScoreAnnotator futureVariantFunctionalScoreAnnotator = null; Future>> variantFunctionalScoreFuture = null; - if (annotatorSet.contains("functionalScore")) { + if (SpeciesUtils.hasData(configuration, species, VARIATION_FUNCTIONAL_SCORE_DATA) && annotatorSet.contains("functionalScore")) { futureVariantFunctionalScoreAnnotator = new FutureVariantFunctionalScoreAnnotator(normalizedVariantList, QueryOptions.empty(), dataRelease.getRelease()); variantFunctionalScoreFuture = CACHED_THREAD_POOL.submit(futureVariantFunctionalScoreAnnotator); @@ -484,7 +500,8 @@ private List runAnnotationProcess(List normalizedVar FutureClinicalAnnotator futureClinicalAnnotator = null; Future>> clinicalFuture = null; // FIXME "clinical" is deprecated, replaced with traitAssociation - if (annotatorSet.contains("clinical") || annotatorSet.contains("traitAssociation")) { + if (SpeciesUtils.hasData(configuration, species, CLINICAL_VARIANT_DATA) + && (annotatorSet.contains("clinical") || annotatorSet.contains("traitAssociation"))) { QueryOptions queryOptions = new QueryOptions(); queryOptions.add(ParamConstants.QueryParams.PHASE.key(), phased); queryOptions.add(ParamConstants.QueryParams.CHECK_AMINO_ACID_CHANGE.key(), checkAminoAcidChange); @@ -495,7 +512,7 @@ private List runAnnotationProcess(List normalizedVar FutureRepeatsAnnotator futureRepeatsAnnotator = null; Future>> repeatsFuture = null; - if (annotatorSet.contains("repeats")) { + if 
(SpeciesUtils.hasData(configuration, species, REPEATS_DATA) && annotatorSet.contains("repeats")) { futureRepeatsAnnotator = new FutureRepeatsAnnotator(normalizedVariantList, dataRelease.getRelease()); repeatsFuture = CACHED_THREAD_POOL.submit(futureRepeatsAnnotator); } @@ -509,15 +526,16 @@ private List runAnnotationProcess(List normalizedVar FutureSpliceScoreAnnotator futureSpliceScoreAnnotator = null; Future>> spliceScoreFuture = null; - if (annotatorSet.contains("consequenceType")) { + if (SpeciesUtils.hasData(configuration, species, SPLICE_SCORE_DATA) && annotatorSet.contains("consequenceType")) { futureSpliceScoreAnnotator = new FutureSpliceScoreAnnotator(normalizedVariantList, QueryOptions.empty(), - dataRelease.getRelease()); + dataRelease.getRelease(), apiKey, variantManager); spliceScoreFuture = CACHED_THREAD_POOL.submit(futureSpliceScoreAnnotator); } FuturePharmacogenomicsAnnotator futurePharmacogenomicsAnnotator = null; Future>> pharmacogenomicsFuture = null; - if (annotatorSet.contains("pharmacogenomics") && dataRelease.getCollections().containsKey(EtlCommons.PHARMACOGENOMICS_DATA)) { + if (SpeciesUtils.hasData(configuration, species, PHARMACOGENOMICS_DATA) && annotatorSet.contains("pharmacogenomics") + && dataRelease.getCollections().containsKey(EtlCommons.PHARMACOGENOMICS_DATA)) { futurePharmacogenomicsAnnotator = new FuturePharmacogenomicsAnnotator(normalizedVariantList, QueryOptions.empty(), dataRelease.getRelease(), pharmacogenomicsManager, logger); pharmacogenomicsFuture = CACHED_THREAD_POOL.submit(futurePharmacogenomicsAnnotator); @@ -1584,7 +1602,8 @@ public void processResults(Future>> variationFu } } - if (annotatorSet.contains("populationFrequencies") && preferredVariant != null) { + if (annotatorSet.contains("populationFrequencies") && preferredVariant != null + && preferredVariant.getAnnotation() != null) { variantAnnotationList.get(i) .setPopulationFrequencies(preferredVariant.getAnnotation().getPopulationFrequencies()); } @@ -1909,74 
+1928,6 @@ public void processResults(Future>> cytobandFu } } - class FutureSpliceScoreAnnotator implements Callable>> { - private List variantList; - private QueryOptions queryOptions; - private int dataRelease; - - FutureSpliceScoreAnnotator(List variantList, QueryOptions queryOptions, int dataRelease) { - this.variantList = variantList; - this.queryOptions = queryOptions; - this.dataRelease = dataRelease; - } - - @Override - public List> call() throws Exception { - long startTime = System.currentTimeMillis(); - - List> cellBaseDataResultList = new ArrayList<>(variantList.size()); - - logger.debug("Query splice"); - // Want to return only one CellBaseDataResult object per Variant - for (Variant variant : variantList) { - cellBaseDataResultList.add(variantManager.getSpliceScoreVariant(variant, apiKey, dataRelease)); - } - logger.debug("Splice score query performance is {}ms for {} variants", System.currentTimeMillis() - startTime, - variantList.size()); - return cellBaseDataResultList; - } - - public void processResults(Future>> spliceFuture, - List variantAnnotationList) - throws InterruptedException, ExecutionException { - List> spliceCellBaseDataResults; - try { - spliceCellBaseDataResults = spliceFuture.get(30, TimeUnit.SECONDS); - } catch (TimeoutException e) { - spliceFuture.cancel(true); - throw new ExecutionException("Unable to finish splice score query on time", e); - } - - if (CollectionUtils.isNotEmpty(spliceCellBaseDataResults)) { - for (int i = 0; i < variantAnnotationList.size(); i++) { - CellBaseDataResult spliceScoreResult = spliceCellBaseDataResults.get(i); - if (spliceScoreResult != null && CollectionUtils.isNotEmpty(spliceScoreResult.getResults())) { - for (SpliceScore spliceScore : spliceScoreResult.getResults()) { - for (ConsequenceType ct : variantAnnotationList.get(i).getConsequenceTypes()) { - for (SpliceScoreAlternate spliceScoreAlt : spliceScore.getAlternates()) { - String alt = 
StringUtils.isEmpty(variantAnnotationList.get(i).getAlternate()) - ? "-" - : variantAnnotationList.get(i).getAlternate(); - if (alt.equals(spliceScoreAlt.getAltAllele())) { - if (StringUtils.isEmpty(spliceScore.getTranscriptId()) - || StringUtils.isEmpty(ct.getTranscriptId()) - || spliceScore.getTranscriptId().equals(ct.getTranscriptId())) { - SpliceScores scores = new SpliceScores(spliceScore.getSource(), spliceScoreAlt.getScores()); - if (ct.getSpliceScores() == null) { - ct.setSpliceScores(new ArrayList<>()); - } - ct.getSpliceScores().add(scores); - } - } - } - } - } - } - } - } - } - } - public VariantNormalizer getNormalizer() { return normalizer; } diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/impl/core/VariantAnnotationCalculatorTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/impl/core/VariantAnnotationCalculatorTest.java index b973f0b996..856d19c1c4 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/impl/core/VariantAnnotationCalculatorTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/impl/core/VariantAnnotationCalculatorTest.java @@ -51,7 +51,8 @@ public class VariantAnnotationCalculatorTest extends GenericMongoDBAdaptorTest { public VariantAnnotationCalculatorTest() throws Exception { super(); - variantAnnotationCalculator = new VariantAnnotationCalculator(SPECIES, ASSEMBLY, dataRelease, apiKey, cellBaseManagerFactory); + variantAnnotationCalculator = new VariantAnnotationCalculator(SPECIES, ASSEMBLY, dataRelease, apiKey, cellBaseManagerFactory, + cellBaseConfiguration); jsonObjectMapper = new ObjectMapper(); jsonObjectMapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); @@ -646,7 +647,7 @@ public void testCellBaseDataResultGroupingDecomposedMNVs() throws Exception { // Creating here a local VariantAnnotationCalculator since this test requires setting normalizer decompose // option to true which probably breaks some other tests. 
VariantAnnotationCalculator localScopeCalculator = new VariantAnnotationCalculator("hsapiens", "GRCh37", dataRelease, - apiKey, cellBaseManagerFactory); + apiKey, cellBaseManagerFactory, cellBaseConfiguration); // One MNV and one singleton SNV. Two CellBaseDataResults must be returned: first with two VariantAnnotation objects // and id corresponding to the original MNV call. Second with just one VariantAnnotation object. @@ -1041,7 +1042,8 @@ null, new Breakend(new BreakendMate("2", 10000, 10000 - 100, @Test public void testLicensedClinicalHGMDAnnotation() throws Exception { - variantAnnotationCalculator = new VariantAnnotationCalculator(SPECIES, ASSEMBLY, dataRelease, HGMD_ACCESS_API_KEY, cellBaseManagerFactory); + variantAnnotationCalculator = new VariantAnnotationCalculator(SPECIES, ASSEMBLY, dataRelease, HGMD_ACCESS_API_KEY, + cellBaseManagerFactory, cellBaseConfiguration); QueryOptions queryOptions = new QueryOptions("useCache", false); queryOptions.put("include", "clinical"); @@ -1060,7 +1062,8 @@ public void testLicensedClinicalHGMDAnnotation() throws Exception { @Test public void testLicensedClinicalHGMDandCOSMICAnnotation() throws Exception { - variantAnnotationCalculator = new VariantAnnotationCalculator(SPECIES, ASSEMBLY, dataRelease, HGMD_COSMIC_ACCESS_API_KEY, cellBaseManagerFactory); + variantAnnotationCalculator = new VariantAnnotationCalculator(SPECIES, ASSEMBLY, dataRelease, HGMD_COSMIC_ACCESS_API_KEY, + cellBaseManagerFactory, cellBaseConfiguration); QueryOptions queryOptions = new QueryOptions("useCache", false); queryOptions.put("include", "clinical"); @@ -1084,7 +1087,8 @@ public void testNoLicensedClinicalAnnotation() throws Exception { queryOptions.put("include", "clinical"); queryOptions.put("normalize", true); - variantAnnotationCalculator = new VariantAnnotationCalculator(SPECIES, ASSEMBLY, dataRelease, null, cellBaseManagerFactory); + variantAnnotationCalculator = new VariantAnnotationCalculator(SPECIES, ASSEMBLY, dataRelease, null, 
cellBaseManagerFactory, + cellBaseConfiguration); Variant variant = new Variant("10", 113588287, "G", "A"); CellBaseDataResult cellBaseDataResult = variantAnnotationCalculator diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/managers/DataReleaseManagerTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/managers/DataReleaseManagerTest.java index 5c0f687e62..aa13081bab 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/managers/DataReleaseManagerTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/managers/DataReleaseManagerTest.java @@ -152,7 +152,7 @@ public void testAnnotation() throws CellBaseException, QueryException, Execution DataRelease dataRelease = dataReleaseManager.get(1); VariantAnnotationCalculator annotator = new VariantAnnotationCalculator(SPECIES, ASSEMBLY, dataRelease, apiKey, - cellBaseManagerFactory); + cellBaseManagerFactory, cellBaseConfiguration); Variant variant = new Variant("10", 113588287, "G", "A"); CellBaseDataResult cellBaseDataResult = annotator.getAnnotationByVariant(variant, QueryOptions.empty()); diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/variant/VariantManagerTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/variant/VariantManagerTest.java index 8d9c6467f9..60c4d59ff8 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/variant/VariantManagerTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/variant/VariantManagerTest.java @@ -46,7 +46,8 @@ public VariantManagerTest() throws CellBaseException { jsonObjectMapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); jsonObjectMapper.setSerializationInclusion(JsonInclude.Include.NON_NULL); - variantAnnotationCalculator = new VariantAnnotationCalculator(SPECIES, ASSEMBLY, dataRelease, apiKey, cellBaseManagerFactory); + variantAnnotationCalculator = new VariantAnnotationCalculator(SPECIES, ASSEMBLY, dataRelease, apiKey, cellBaseManagerFactory, + cellBaseConfiguration); 
variantManager = cellBaseManagerFactory.getVariantManager(SPECIES, ASSEMBLY); } From 4326fa32faa36c75d6e73357e2ca97e113d24cdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 2 Aug 2024 18:29:54 +0200 Subject: [PATCH 129/148] lib: add log messages in protein builder, #TASK-5776, #TASK-5564 --- .../cellbase/lib/builders/ProteinBuilder.java | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java index c7e490d424..186a0218b2 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java @@ -122,16 +122,20 @@ public void parse() throws CellBaseException, IOException { } logger.info(PARSING_LOG_MESSAGE, interProFiles.get(0)); + String interproName = getDataName(INTERPRO_DATA); + int numLine = 0; + int numInterProLinesProcessed = 0; + int numUniqueProteinsProcessed = 0; try (BufferedReader interproBuffereReader = FileUtils.newBufferedReader(interProFiles.get(0).toPath())) { + Set hashSet = proteinMap.keySet(); Set visited = new HashSet<>(proteinMap.size()); - int numInterProLinesProcessed = 0; - int numUniqueProteinsProcessed = 0; String[] fields; String line; boolean iprAdded; while ((line = interproBuffereReader.readLine()) != null) { + numLine++; fields = line.split("\t"); if (hashSet.contains(fields[0])) { @@ -183,24 +187,21 @@ public void parse() throws CellBaseException, IOException { visited.add(fields[0]); numUniqueProteinsProcessed++; } - } else { - logger.info("{} not found in protein map", fields[0]); } if (++numInterProLinesProcessed % 10000000 == 0) { - logger.info("{} {} lines processed", numInterProLinesProcessed, getDataName(INTERPRO_DATA)); - logger.info("{} {} unique proteins processed", 
getDataName(INTERPRO_DATA), numUniqueProteinsProcessed); + printInfoLogs(numInterProLinesProcessed, numUniqueProteinsProcessed, interproName); } } - logger.info("{} {} lines processed", numInterProLinesProcessed, getDataName(INTERPRO_DATA)); - logger.info("{} {} unique proteins processed", getDataName(INTERPRO_DATA), numUniqueProteinsProcessed); + printInfoLogs(numInterProLinesProcessed, numUniqueProteinsProcessed, interproName); logger.info(PARSING_DONE_LOG_MESSAGE); } catch (IOException e) { - throw new CellBaseException("Error parsing " + getDataName(INTERPRO_DATA) + " file: " + interProFiles.get(0), e); + logger.error("Error parsing {} file: {}. Num. line = {}. Error stack trace = {}", interproName, interProFiles.get(0), + numLine, Arrays.toString(e.getStackTrace())); + printInfoLogs(numInterProLinesProcessed, numUniqueProteinsProcessed, interproName); } - // Serialize and save results RocksIterator rocksIterator = rocksDb.newIterator(); for (rocksIterator.seekToFirst(); rocksIterator.isValid(); rocksIterator.next()) { @@ -279,4 +280,10 @@ private void splitUniprot(Path uniprotFilePath, Path splitOutdirPath) throws IOE private String getMismatchNumFilesErrorMessage(String dataName, int numFiles) { return "Only one " + dataName + " file is expected, but currently there are " + numFiles + " files"; } + + private void printInfoLogs(int numInterProLinesProcessed, int numUniqueProteinsProcessed, String dataName) { + logger.info("{}: {} lines processed", dataName, numInterProLinesProcessed); + logger.info("{}: {} unique proteins processed", dataName, numUniqueProteinsProcessed); + } + } From 2c7ddfb3c9be3cf9f66d35bb4ed7bcae85408f2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 5 Aug 2024 08:21:25 +0200 Subject: [PATCH 130/148] lib: set variant ID in VariantBuilder, #TASK-5576, #TASK-5564 --- .../java/org/opencb/cellbase/lib/builders/VariationBuilder.java | 1 + 1 file changed, 1 insertion(+) diff --git 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java index fc3d30dc25..e5eedf51b8 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java @@ -83,6 +83,7 @@ private void parseVcf() throws IOException { Iterator iterator = variantVcfReader.iterator(); while (iterator.hasNext()) { Variant variant = iterator.next(); + variant.setId(variant.toStringSimple()); fileSerializer.serialize(variant, VARIATION_CHR_PREFIX + variant.getChromosome()); } variantVcfReader.close(); From 78211d08becb0b15592f2325b9627e3b9022c256 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 5 Aug 2024 09:56:28 +0200 Subject: [PATCH 131/148] lib: remove System.exit, #TASK-5576, #TASK-5564 --- .../cellbase/app/cli/admin/executors/BuildCommandExecutor.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 89dc1cfc71..d7683c8e59 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -273,8 +273,6 @@ private AbstractBuilder buildGene() throws CellBaseException { return null; } - System.exit(-1); - copyVersionFiles(versionFiles, geneBuildPath); return new GeneBuilder(geneDownloadPath, geneBuildPath, speciesConfiguration, flexibleGTFParsing, configuration); From e0c6a13939cf2c38d7957ef82c03572565161caa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 5 Aug 2024 17:19:23 +0200 Subject: [PATCH 132/148] lib: fix 
VariationBuilder by converting SV values from Ensembl to standard values, e.g. to , #TASK-5576, #TASK-5564 --- .../lib/builders/VariationBuilder.java | 53 ++++++++++++++++--- 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java index e5eedf51b8..13ad0acbbc 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java @@ -24,15 +24,15 @@ import org.opencb.biodata.tools.variant.VariantVcfHtsjdkReader; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; +import org.opencb.commons.run.ParallelTaskRunner; import java.io.IOException; import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.Path; -import java.util.Iterator; -import java.util.Locale; +import java.util.*; +import java.util.stream.Collectors; -import static org.opencb.cellbase.lib.EtlCommons.ENSEMBL_DATA; import static org.opencb.cellbase.lib.EtlCommons.HOMO_SAPIENS; /** @@ -45,6 +45,24 @@ public class VariationBuilder extends AbstractBuilder { public static final String VARIATION_CHR_PREFIX = "variation_chr"; + public static final Map SV_VALUES_MAP; + + static { + Map tempMap = new HashMap<>(); + tempMap.put("", ""); + tempMap.put("", ""); + tempMap.put("", ""); + tempMap.put("", ""); + tempMap.put("", ""); + tempMap.put("", ""); + tempMap.put("", ""); + tempMap.put("", ""); + tempMap.put("", ""); + tempMap.put("", ""); + SV_VALUES_MAP = Collections.unmodifiableMap(tempMap); + } + + public VariationBuilder(Path downloadPath, String species, CellBaseFileSerializer fileSerializer) { super(fileSerializer); @@ -74,22 +92,45 @@ private void parseVcf() throws IOException { try (DirectoryStream vcfPaths = 
Files.newDirectoryStream(downloadPath, entry -> entry.getFileName().toString().startsWith(prefix))) { for (Path vcfPath : vcfPaths) { - VariantStudyMetadata variantStudyMetadata = new VariantFileMetadata(vcfPath.getFileName().toString(), "") - .toVariantStudyMetadata(ENSEMBL_DATA); + + logger.info(PARSING_LOG_MESSAGE, vcfPath); + + VariantStudyMetadata variantStudyMetadata = new VariantFileMetadata(vcfPath.getFileName().toString(), + vcfPath.toAbsolutePath().toString()).toVariantStudyMetadata(""); VariantReader variantVcfReader = new VariantVcfHtsjdkReader(vcfPath, variantStudyMetadata, new VariantNormalizer(normalizerConfig)); // Write variant to the JSON files according to the chromosome + int count = 0; Iterator iterator = variantVcfReader.iterator(); while (iterator.hasNext()) { Variant variant = iterator.next(); - variant.setId(variant.toStringSimple()); + if (SV_VALUES_MAP.containsKey(variant.getAlternate())) { + variant.setAlternate(SV_VALUES_MAP.get(variant.getAlternate())); + } + variant.setId(variant.toString()); fileSerializer.serialize(variant, VARIATION_CHR_PREFIX + variant.getChromosome()); + if (++count % 1000000 == 0) { + logger.info("{} variants parsed", count); + } } variantVcfReader.close(); + + logger.info("{} variants parsed", count); + logger.info(PARSING_DONE_LOG_MESSAGE); } } fileSerializer.close(); } + + + + public class VariantReaderTask implements ParallelTaskRunner.TaskWithException { + + @Override + public List apply(List list) throws Exception { + return list.stream().map(v -> v.setId(v.toString())).collect(Collectors.toList()); + } + } } From 81e4cb174b01b4a29800bfc026e9128d743ac8a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 6 Aug 2024 09:16:23 +0200 Subject: [PATCH 133/148] lib: add new command 'data-list' to display the list of data supported by a given species, and update download options, #TASK-5575, #TASK-5564 --- .../app/cli/admin/AdminCliOptionsParser.java | 24 ++++++-- 
.../cellbase/app/cli/admin/AdminMain.java | 3 + .../executors/DataListCommandExecutor.java | 56 +++++++++++++++++++ 3 files changed, 79 insertions(+), 4 deletions(-) create mode 100644 cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DataListCommandExecutor.java diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java index 55a446c4ea..aab79863ce 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java @@ -34,6 +34,7 @@ public class AdminCliOptionsParser extends CliOptionsParser { private DownloadCommandOptions downloadCommandOptions; private BuildCommandOptions buildCommandOptions; + private DataListCommandOptions dataListCommandOptions; private DataReleaseCommandOptions dataReleaseCommandOptions; private ApiKeyCommandOptions apiKeyCommandOptions; private LoadCommandOptions loadCommandOptions; @@ -50,6 +51,7 @@ public AdminCliOptionsParser() { downloadCommandOptions = new DownloadCommandOptions(); buildCommandOptions = new BuildCommandOptions(); + dataListCommandOptions = new DataListCommandOptions(); dataReleaseCommandOptions = new DataReleaseCommandOptions(); apiKeyCommandOptions = new ApiKeyCommandOptions(); loadCommandOptions = new LoadCommandOptions(); @@ -61,6 +63,7 @@ public AdminCliOptionsParser() { jCommander.addCommand("download", downloadCommandOptions); jCommander.addCommand("build", buildCommandOptions); + jCommander.addCommand("data-list", dataListCommandOptions); jCommander.addCommand("data-release", dataReleaseCommandOptions); jCommander.addCommand("api-key", apiKeyCommandOptions); jCommander.addCommand("load", loadCommandOptions); @@ -84,10 +87,9 @@ public class DownloadCommandOptions { @ParametersDelegate public SpeciesAndAssemblyCommandOptions 
speciesAndAssemblyOptions = speciesAndAssemblyCommandOptions; - @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to download: " + GENOME_DATA + "," + GENE_DATA - + "," + VARIATION_FUNCTIONAL_SCORE_DATA + "," + MISSENSE_VARIATION_SCORE_DATA + "," + REGULATION_DATA + "," + PROTEIN_DATA - + "," + CONSERVATION_DATA + "," + CLINICAL_VARIANT_DATA + "," + REPEATS_DATA + "," + ONTOLOGY_DATA + "," + SPLICE_SCORE_DATA - + "," + PUBMED_DATA + "," + PHARMACOGENOMICS_DATA + "; or use 'all' to download everything", required = true, arity = 1) + @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to download, it depends on the species; use the" + + " command 'cellbase-admin.sh data-list' to know the data list available for each species; or use 'all' to download" + + " everything", required = true, arity = 1) public String data; @Parameter(names = {"-o", "--outdir"}, description = "Downloaded files will be saved in this directory.", required = true, @@ -131,6 +133,16 @@ public class BuildCommandOptions { } + @Parameters(commandNames = {"data-list"}, commandDescription = "List the data supported by the given species") + public class DataListCommandOptions { + + @ParametersDelegate + public CommonCommandOptions commonOptions = commonCommandOptions; + + @Parameter(names = {"-s", "--species"}, description = "Name of the species to list the data, valid formats include 'Homo sapiens' or 'hsapiens'", arity = 1) + public String species = "Homo sapiens"; + } + @Parameters(commandNames = {"data-release"}, commandDescription = "Manage data releases in order to support multiple versions of data") public class DataReleaseCommandOptions { @@ -394,6 +406,10 @@ public BuildCommandOptions getBuildCommandOptions() { return buildCommandOptions; } + public DataListCommandOptions getDataListCommandOptions() { + return dataListCommandOptions; + } + public DataReleaseCommandOptions getDataReleaseCommandOptions() { return 
dataReleaseCommandOptions; } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java index 06030ec485..d46d32709f 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java @@ -64,6 +64,9 @@ public static void main(String[] args) { case "load": commandExecutor = new LoadCommandExecutor(cliOptionsParser.getLoadCommandOptions()); break; + case "data-list": + commandExecutor = new DataListCommandExecutor(cliOptionsParser.getDataListCommandOptions()); + break; case "data-release": commandExecutor = new DataReleaseCommandExecutor(cliOptionsParser.getDataReleaseCommandOptions()); break; diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DataListCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DataListCommandExecutor.java new file mode 100644 index 0000000000..8ec6a5e421 --- /dev/null +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DataListCommandExecutor.java @@ -0,0 +1,56 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.opencb.cellbase.app.cli.admin.executors; + +import org.apache.commons.lang3.StringUtils; +import org.opencb.cellbase.app.cli.CommandExecutor; +import org.opencb.cellbase.app.cli.admin.AdminCliOptionsParser; +import org.opencb.cellbase.core.config.SpeciesConfiguration; +import org.opencb.cellbase.core.utils.SpeciesUtils; + +import java.util.List; + +public class DataListCommandExecutor extends CommandExecutor { + + private AdminCliOptionsParser.DataListCommandOptions dataListCommandOptions; + + public DataListCommandExecutor(AdminCliOptionsParser.DataListCommandOptions dataListCommandOptions) { + super(dataListCommandOptions.commonOptions.logLevel, dataListCommandOptions.commonOptions.conf); + + this.dataListCommandOptions = dataListCommandOptions; + } + + + /** + * Execute one of the selected actions according to the input parameters. + */ + public void execute() { + SpeciesConfiguration speciesConfiguration = SpeciesUtils.getSpeciesConfiguration(configuration, dataListCommandOptions.species); + if (speciesConfiguration == null) { + System.out.println("Unknown species: " + dataListCommandOptions.species); + System.out.println("Available species:"); + List allSpecies = SpeciesUtils.getAllSpecies(configuration); + for (SpeciesConfiguration species : allSpecies) { + System.out.println("\t- " + species.getScientificName() + " (" + species.getId() + ")"); + } + return; + } + + System.out.println("Species: " + dataListCommandOptions.species); + System.out.println("Available data: " + StringUtils.join(speciesConfiguration.getData(), ",")); + } +} From 280fd67daf263ed664f8daa39f0125196f951432 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 6 Aug 2024 09:54:33 +0200 Subject: [PATCH 134/148] app: update build options and fix sonnar issues, #TASK-5576, #TASK-5564 --- .../app/cli/admin/AdminCliOptionsParser.java | 51 +++++++++++-------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git 
a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java index aab79863ce..657dc66f26 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java @@ -24,8 +24,6 @@ import java.util.List; import java.util.Map; -import static org.opencb.cellbase.lib.EtlCommons.*; - public class AdminCliOptionsParser extends CliOptionsParser { @@ -78,7 +76,8 @@ public void parse(String[] args) throws ParameterException { jCommander.parse(args); } - @Parameters(commandNames = {"download"}, commandDescription = "Download all different data sources provided in the configuration.yml file") + @Parameters(commandNames = {"download"}, commandDescription = "Download all different data sources provided in the configuration.yml" + + " file") public class DownloadCommandOptions { @ParametersDelegate @@ -103,19 +102,21 @@ public class BuildCommandOptions { @ParametersDelegate public CommonCommandOptions commonOptions = commonCommandOptions; - @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to build: " + GENOME_DATA + "," + GENE_DATA - + "," + VARIATION_FUNCTIONAL_SCORE_DATA + "," + MISSENSE_VARIATION_SCORE_DATA + "," + REGULATION_DATA + "," + PROTEIN_DATA - + "," + CONSERVATION_DATA + "," + CLINICAL_VARIANT_DATA + "," + REPEATS_DATA + "," + ONTOLOGY_DATA + "," + SPLICE_SCORE_DATA - + "," + PUBMED_DATA + "," + PHARMACOGENOMICS_DATA + "; or use 'all' to build everything", required = true, arity = 1) + @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to build, it depends on the species; use the" + + " command 'cellbase-admin.sh data-list' to know the data list available for each species; or use 'all' to build" + + " everything", required = true, arity = 1) public String data; - 
@Parameter(names = {"-s", "--species"}, description = "Name of the species to be built, valid formats include 'Homo sapiens' or 'hsapiens'", required = false, arity = 1) + @Parameter(names = {"-s", "--species"}, description = "Name of the species to be built, valid formats include 'Homo sapiens' or" + + " 'hsapiens'", arity = 1) public String species = "Homo sapiens"; - @Parameter(names = {"-a", "--assembly"}, description = "Name of the assembly, if empty the first assembly in configuration.yml will be used", required = false, arity = 1) + @Parameter(names = {"-a", "--assembly"}, description = "Name of the assembly, if empty the first assembly in configuration.yml" + + " will be used", arity = 1) public String assembly; - @Parameter(names = {"-o", "--outdir"}, description = "Downloaded files will be saved in this directory.", required = true, arity = 1) + @Parameter(names = {"-o", "--outdir"}, description = "Downloaded files will be saved in this directory.", required = true, + arity = 1) public String outputDirectory; @Parameter(names = {"--skip-normalize"}, description = "Skip normalization of clinical variants. Normalization" @@ -139,7 +140,8 @@ public class DataListCommandOptions { @ParametersDelegate public CommonCommandOptions commonOptions = commonCommandOptions; - @Parameter(names = {"-s", "--species"}, description = "Name of the species to list the data, valid formats include 'Homo sapiens' or 'hsapiens'", arity = 1) + @Parameter(names = {"-s", "--species"}, description = "Name of the species to list the data, valid formats include 'Homo sapiens'" + + " or 'hsapiens'", arity = 1) public String species = "Homo sapiens"; } @@ -161,11 +163,13 @@ public class DataReleaseCommandOptions { @Parameter(names = {"--update"}, description = "Data release to be updated by adding CellBase vesions", arity = 1) public int update; - @Parameter(names = {"--add-versions"}, description = "CellBase versions separated by commas, e.g.: v5.2,v5.3. 
This parameter has to be used together to the parameter --update", arity = 1) + @Parameter(names = {"--add-versions"}, description = "CellBase versions separated by commas, e.g.: v5.2,v5.3. This parameter has" + + " to be used together to the parameter --update", arity = 1) public String versions; } - @Parameters(commandNames = {"api-key"}, commandDescription = "Manage API keys in order to access to restricted/licensed data sources and set quota") + @Parameters(commandNames = {"api-key"}, commandDescription = "Manage API keys in order to access to restricted/licensed data sources" + + " and set quota") public class ApiKeyCommandOptions { @ParametersDelegate @@ -174,9 +178,9 @@ public class ApiKeyCommandOptions { @Parameter(names = {"--create-api-key"}, description = "Create an API key", arity = 0) public boolean createApiKey; - @Parameter(names = {"--licensed-data-sources"}, description = "Use this parameter in conjunction with --create-api-key to specify the" - + " licensed data sources separated by commas and optionally the expiration date: source[:dd/mm/yyyy]. e.g.:" - + " cosmic:31/01/2025,hgmd", arity = 1) + @Parameter(names = {"--licensed-data-sources"}, description = "Use this parameter in conjunction with --create-api-key to" + +" specify the licensed data sources separated by commas and optionally the expiration date: source[:dd/mm/yyyy]. 
e.g.:" + + " spliceai:31/01/2025,hgmd", arity = 1) public String dataSources; @Parameter(names = {"--expiration"}, description = "Use this parameter in conjunction with --create-api-key to specify the" @@ -349,16 +353,20 @@ public class ValidationCommandOptions { @ParametersDelegate public CommonCommandOptions commonOptions = commonCommandOptions; - @Parameter(names = {"-s", "--species"}, description = "Name of the species to be downloaded, valid format include 'Homo sapiens' or 'hsapiens'", arity = 1) + @Parameter(names = {"-s", "--species"}, description = "Name of the species to be downloaded, valid format include 'Homo sapiens'" + + " or 'hsapiens'", arity = 1) public String species = "Homo sapiens"; - @Parameter(names = {"-a", "--assembly"}, description = "Name of the assembly, if empty the first assembly in configuration.json will be used", required = false, arity = 1) + @Parameter(names = {"-a", "--assembly"}, description = "Name of the assembly, if empty the first assembly in configuration.json" + + " will be used", arity = 1) public String assembly = "GRCh38"; - @Parameter(names = {"--data-release"}, description = "Data release. To use the default data release, please, set this parameter to 0", required = false, arity = 1) + @Parameter(names = {"--data-release"}, description = "Data release. 
To use the default data release, please, set this parameter" + + " to 0", arity = 1) public int dataRelease = 0; - @Parameter(names = {"--api-key"}, description = "API key to get access to licensed/restricted data sources such as COSMIC or HGMD", required = false, arity = 1) + @Parameter(names = {"--api-key"}, description = "API key to get access to licensed/restricted data sources such as SpliceAI or" + + " HGMD", arity = 1) public String apiKey; @Parameter(names = {"-i", "--input-file"}, description = "Full path to VCF", required = true, arity = 1) @@ -367,8 +375,7 @@ public class ValidationCommandOptions { @Parameter(names = {"-V", "--vep-file"}, description = "Full path to VEP annotation JSON file", required = true, arity = 1) public String vepFile; - @Parameter(names = {"-o", "--output-dir"}, description = "Output directory where the comparison report is saved", required = false, - arity = 1) + @Parameter(names = {"-o", "--output-dir"}, description = "Output directory where the comparison report is saved", arity = 1) public String outputDirectory = "/tmp"; @Parameter(names = {"-t", "--type"}, description = "Which type to analyse: 'Protein', 'Transcript' or 'Both'", required = From 2235e5c13eedd60d26daa03bcdd5db60a801a075 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 6 Aug 2024 09:58:42 +0200 Subject: [PATCH 135/148] app: update CLI option descriptions for loading, exporting, indexing..., #TASK-6142, #TASK-5564 --- .../app/cli/admin/AdminCliOptionsParser.java | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java index 657dc66f26..894e3b0b5e 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java +++ 
b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java @@ -205,9 +205,9 @@ public class LoadCommandOptions { @ParametersDelegate public CommonCommandOptions commonOptions = commonCommandOptions; - @Parameter(names = {"-d", "--data"}, description = "Data model type to be loaded: genome, gene, variation," - + " conservation, regulation, protein, clinical_variants, repeats, regulatory_pfm, splice_score, pubmed, pharmacogenomics." - + " 'all' loads everything", required = true, arity = 1) + @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to load, it depends on the species; use the" + + " command 'cellbase-admin.sh data-list' to know the data list available for each species; or use 'all' to load" + + " everything", required = true, arity = 1) public String data; @Parameter(names = {"-i", "--input"}, required = true, arity = 1, @@ -252,9 +252,9 @@ public class ExportCommandOptions { @ParametersDelegate public CommonCommandOptions commonOptions = commonCommandOptions; - @Parameter(names = {"-d", "--data"}, description = "Data model type to be loaded: genome, gene, variation, " - + "conservation, regulation, protein, clinical_variants, repeats, regulatory_pfm, splice_score, pubmed. 
'all' " - + " loads everything", required = true, arity = 1) + @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to export, it depends on the species; use the" + + " command 'cellbase-admin.sh data-list' to know the data list available for each species; or use 'all' to export" + + " everything", required = true, arity = 1) public String data; @Parameter(names = {"--db", "--database"}, description = "Database name, e.g., cellbase_hsapiens_grch38_v5", required = true, @@ -314,10 +314,9 @@ public class IndexCommandOptions { @ParametersDelegate public CommonCommandOptions commonOptions = commonCommandOptions; - @Parameter(names = {"-d", "--data"}, description = "Data model type to be indexed: genome, gene, variation, " - + "regulation, protein, ontology, clinical_variants, repeats, refseq and missense_variation_functional_score. 'all' " - + "indexes everything", required = true, - arity = 1) + @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to index, it depends on the species; use the" + + " command 'cellbase-admin.sh data-list' to know the data list available for each species; or use 'all' to index" + + " everything", required = true, arity = 1) public String data; @Parameter(names = {"--db", "--database"}, description = "Database name.", required = true, arity = 1) From 6a4c16a7b93dfc473ae522e058c5d851ebc5c4ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 7 Aug 2024 10:37:40 +0200 Subject: [PATCH 136/148] test: update JUnit tests according to the latest changes, #TASK-5564 --- cellbase-core/src/test/resources/configuration.yml | 5 ----- .../src/test/resources/configuration.test.yaml | 5 ----- .../{ => motif_features}/motifFeaturesVersion.json | 0 .../{ => motif_features}/motif_features.gff.gz | Bin .../{ => motif_features}/motif_features.gff.gz.tbi | Bin .../Regulatory_Build.regulatory_features.gff.gz | Bin .../regulatoryBuildVersion.json | 0 7 files 
changed, 10 deletions(-) rename cellbase-lib/src/test/resources/regulation/{ => motif_features}/motifFeaturesVersion.json (100%) rename cellbase-lib/src/test/resources/regulation/{ => motif_features}/motif_features.gff.gz (100%) rename cellbase-lib/src/test/resources/regulation/{ => motif_features}/motif_features.gff.gz.tbi (100%) rename cellbase-lib/src/test/resources/regulation/{ => regulatory_build}/Regulatory_Build.regulatory_features.gff.gz (100%) rename cellbase-lib/src/test/resources/regulation/{ => regulatory_build}/regulatoryBuildVersion.json (100%) diff --git a/cellbase-core/src/test/resources/configuration.yml b/cellbase-core/src/test/resources/configuration.yml index dc7901d8d5..9c03fdec32 100644 --- a/cellbase-core/src/test/resources/configuration.yml +++ b/cellbase-core/src/test/resources/configuration.yml @@ -135,11 +135,6 @@ download: version: "2024-04-26" files: HPO: "manual@phenotype_to_genes.txt" - disgenet: - host: https://www.disgenet.org/ - version: "7.0 (January 2020)" - files: - DISGENET: static/disgenet_ap1/files/downloads/all_gene_disease_associations.tsv.gz gnomadConstraints: host: https://storage.googleapis.com/ version: "2.1.1" diff --git a/cellbase-lib/src/test/resources/configuration.test.yaml b/cellbase-lib/src/test/resources/configuration.test.yaml index f5c08b498a..941928b876 100644 --- a/cellbase-lib/src/test/resources/configuration.test.yaml +++ b/cellbase-lib/src/test/resources/configuration.test.yaml @@ -122,11 +122,6 @@ download: version: "2024-04-26" files: HPO: "manual@phenotype_to_genes.txt" - disgenet: - host: https://www.disgenet.org/ - version: "7.0 (January 2020)" - files: - DISGENET: static/disgenet_ap1/files/downloads/all_gene_disease_associations.tsv.gz gnomadConstraints: host: https://storage.googleapis.com/ version: "2.1.1" diff --git a/cellbase-lib/src/test/resources/regulation/motifFeaturesVersion.json b/cellbase-lib/src/test/resources/regulation/motif_features/motifFeaturesVersion.json similarity index 100% 
rename from cellbase-lib/src/test/resources/regulation/motifFeaturesVersion.json rename to cellbase-lib/src/test/resources/regulation/motif_features/motifFeaturesVersion.json diff --git a/cellbase-lib/src/test/resources/regulation/motif_features.gff.gz b/cellbase-lib/src/test/resources/regulation/motif_features/motif_features.gff.gz similarity index 100% rename from cellbase-lib/src/test/resources/regulation/motif_features.gff.gz rename to cellbase-lib/src/test/resources/regulation/motif_features/motif_features.gff.gz diff --git a/cellbase-lib/src/test/resources/regulation/motif_features.gff.gz.tbi b/cellbase-lib/src/test/resources/regulation/motif_features/motif_features.gff.gz.tbi similarity index 100% rename from cellbase-lib/src/test/resources/regulation/motif_features.gff.gz.tbi rename to cellbase-lib/src/test/resources/regulation/motif_features/motif_features.gff.gz.tbi diff --git a/cellbase-lib/src/test/resources/regulation/Regulatory_Build.regulatory_features.gff.gz b/cellbase-lib/src/test/resources/regulation/regulatory_build/Regulatory_Build.regulatory_features.gff.gz similarity index 100% rename from cellbase-lib/src/test/resources/regulation/Regulatory_Build.regulatory_features.gff.gz rename to cellbase-lib/src/test/resources/regulation/regulatory_build/Regulatory_Build.regulatory_features.gff.gz diff --git a/cellbase-lib/src/test/resources/regulation/regulatoryBuildVersion.json b/cellbase-lib/src/test/resources/regulation/regulatory_build/regulatoryBuildVersion.json similarity index 100% rename from cellbase-lib/src/test/resources/regulation/regulatoryBuildVersion.json rename to cellbase-lib/src/test/resources/regulation/regulatory_build/regulatoryBuildVersion.json From 68c9f43bd3513029c9cbc1618149249a246e491c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 7 Aug 2024 16:42:02 +0200 Subject: [PATCH 137/148] lib: improve variation builder by setting xref and annotation, and removing study info, 
#TASK-5576, #TASK-5564 --- .../lib/builders/VariationBuilder.java | 72 +++++++++++++++---- 1 file changed, 57 insertions(+), 15 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java index 13ad0acbbc..e4b9171152 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java @@ -16,22 +16,24 @@ package org.opencb.cellbase.lib.builders; +import org.apache.commons.collections4.MapUtils; import org.opencb.biodata.formats.variant.io.VariantReader; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.VariantFileMetadata; +import org.opencb.biodata.models.variant.avro.AdditionalAttribute; +import org.opencb.biodata.models.variant.avro.VariantAnnotation; +import org.opencb.biodata.models.variant.avro.Xref; import org.opencb.biodata.models.variant.metadata.VariantStudyMetadata; import org.opencb.biodata.tools.variant.VariantNormalizer; import org.opencb.biodata.tools.variant.VariantVcfHtsjdkReader; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; -import org.opencb.commons.run.ParallelTaskRunner; import java.io.IOException; import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.Path; import java.util.*; -import java.util.stream.Collectors; import static org.opencb.cellbase.lib.EtlCommons.HOMO_SAPIENS; @@ -44,6 +46,11 @@ public class VariationBuilder extends AbstractBuilder { private String species; public static final String VARIATION_CHR_PREFIX = "variation_chr"; + public static final String VCF_ID_KEY = "VCF_ID"; + public static final String EVA_PREFIX = "EVA_"; + public static final String RS_PREFIX = "rs"; + + private static final String 
VARIANTS_PARSED_LOG_MESSAGE = "{} variants parsed"; public static final Map SV_VALUES_MAP; @@ -92,7 +99,6 @@ private void parseVcf() throws IOException { try (DirectoryStream vcfPaths = Files.newDirectoryStream(downloadPath, entry -> entry.getFileName().toString().startsWith(prefix))) { for (Path vcfPath : vcfPaths) { - logger.info(PARSING_LOG_MESSAGE, vcfPath); VariantStudyMetadata variantStudyMetadata = new VariantFileMetadata(vcfPath.getFileName().toString(), @@ -105,32 +111,68 @@ private void parseVcf() throws IOException { Iterator iterator = variantVcfReader.iterator(); while (iterator.hasNext()) { Variant variant = iterator.next(); + // Convert alternate for structural variants if (SV_VALUES_MAP.containsKey(variant.getAlternate())) { variant.setAlternate(SV_VALUES_MAP.get(variant.getAlternate())); } + // Set variant ID (after converting the alternate) variant.setId(variant.toString()); + // Set variant annotation: chrom, start, end, ref, alt, xrefs and additional attributes + VariantAnnotation variantAnnotation = new VariantAnnotation(); + variantAnnotation.setChromosome(variant.getChromosome()); + variantAnnotation.setStart(variant.getStart()); + variantAnnotation.setEnd(variant.getEnd()); + variantAnnotation.setReference(variant.getReference()); + variantAnnotation.setAlternate(variant.getAlternate()); + try { + Xref xref = null; + Map attributes = new HashMap<>(); + Map data = variant.getStudies().get(0).getFiles().get(0).getData(); + for (Map.Entry entry : data.entrySet()) { + if (entry.getKey().startsWith(EVA_PREFIX)) { + if (xref == null && data.containsKey(VCF_ID_KEY) && data.get(VCF_ID_KEY).startsWith(RS_PREFIX)) { + xref = new Xref(data.get(VCF_ID_KEY), entry.getKey()); + } + } else if (!entry.getKey().equals(VCF_ID_KEY)) { + attributes.put(entry.getKey(), entry.getValue()); + } + } + if (xref != null) { + variantAnnotation.setXrefs(Collections.singletonList(xref)); + } + if (MapUtils.isNotEmpty(attributes)) { + AdditionalAttribute 
additionalAttribute = new AdditionalAttribute(attributes); + Map additionalAttributeMap = new HashMap<>(); + additionalAttributeMap.put(vcfPath.getFileName().toString(), additionalAttribute); + variantAnnotation.setAdditionalAttributes(additionalAttributeMap); + } + } catch (Exception e) { + logger.warn("Error setting annotation for variant {}: {}", variant.getId(), Arrays.toString(e.getStackTrace())); + } + if (variantAnnotation != null) { + variant.setAnnotation(variantAnnotation); + } + variant.setAnnotation(variantAnnotation); + + // Remove study info + variant.setStudies(null); + + // Serialize fileSerializer.serialize(variant, VARIATION_CHR_PREFIX + variant.getChromosome()); if (++count % 1000000 == 0) { - logger.info("{} variants parsed", count); + logger.info(VARIANTS_PARSED_LOG_MESSAGE, count); + } + if (count > 100) { + break; } } variantVcfReader.close(); - logger.info("{} variants parsed", count); + logger.info(VARIANTS_PARSED_LOG_MESSAGE, count); logger.info(PARSING_DONE_LOG_MESSAGE); } } fileSerializer.close(); } - - - - public class VariantReaderTask implements ParallelTaskRunner.TaskWithException { - - @Override - public List apply(List list) throws Exception { - return list.stream().map(v -> v.setId(v.toString())).collect(Collectors.toList()); - } - } } From 914b9c1125536fdfa4534d86a10d6e64332a8f4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 7 Aug 2024 16:47:56 +0200 Subject: [PATCH 138/148] lib: remove break for testing, #TASK-5576, #TASK-5564 --- .../org/opencb/cellbase/lib/builders/VariationBuilder.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java index e4b9171152..9d52cffe96 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java @@ -162,9 +162,6 @@ private void parseVcf() throws IOException { if (++count % 1000000 == 0) { logger.info(VARIANTS_PARSED_LOG_MESSAGE, count); } - if (count > 100) { - break; - } } variantVcfReader.close(); From 3538e14e35db266b2f2210edb078320605c45365 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 7 Aug 2024 17:33:01 +0200 Subject: [PATCH 139/148] core: add ontology data into configuration file for "mus musculus" and update ontology downloader to take it into account, #TASK-5575, #TASK-5564 --- .../src/main/resources/configuration.yml | 1 + .../src/test/resources/configuration.yml | 3 +- .../lib/download/OntologyDownloadManager.java | 43 ++++++++++--------- .../test/resources/configuration.test.yaml | 3 +- 4 files changed, 28 insertions(+), 22 deletions(-) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index b84fe7d399..f1cb6daee5 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -348,6 +348,7 @@ species: - regulation - protein - variation + - ontology - id: rnorvegicus scientificName: Rattus norvegicus assemblies: diff --git a/cellbase-core/src/test/resources/configuration.yml b/cellbase-core/src/test/resources/configuration.yml index 9c03fdec32..191d19e08c 100644 --- a/cellbase-core/src/test/resources/configuration.yml +++ b/cellbase-core/src/test/resources/configuration.yml @@ -349,7 +349,8 @@ species: - gene - regulation - protein - # - variation + - variation + - ontology - id: rnorvegicus scientificName: Rattus norvegicus assemblies: diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java index c3048c554c..a033403bf9 100644 --- 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java @@ -51,14 +51,31 @@ public List download() throws IOException, InterruptedException, C Path oboFolder = downloadFolder.resolve(ONTOLOGY_DATA); Files.createDirectories(oboFolder); + String version; DownloadFile downloadFile; - // HPO - downloadFile = downloadDataSource(configuration.getDownload().getHpoObo(), HPO_OBO_FILE_ID, oboFolder); - String version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); - saveDataSource(HPO_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - oboFolder.resolve(getDataVersionFilename(HPO_OBO_DATA))); - downloadFiles.add(downloadFile); + if (speciesConfiguration.getScientificName().equalsIgnoreCase(HOMO_SAPIENS)) { + // HPO + downloadFile = downloadDataSource(configuration.getDownload().getHpoObo(), HPO_OBO_FILE_ID, oboFolder); + version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); + saveDataSource(HPO_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(getDataVersionFilename(HPO_OBO_DATA))); + downloadFiles.add(downloadFile); + + // DOID + downloadFile = downloadDataSource(configuration.getDownload().getDoidObo(), DOID_OBO_FILE_ID, oboFolder); + version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); + saveDataSource(DOID_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(getDataVersionFilename(DOID_OBO_DATA))); + downloadFiles.add(downloadFile); + + // Mondo + downloadFile = downloadDataSource(configuration.getDownload().getMondoObo(), MONDO_OBO_FILE_ID, oboFolder); + version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); + saveDataSource(MONDO_OBO_DATA, version, getTimeStamp(), 
Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(getDataVersionFilename(MONDO_OBO_DATA))); + downloadFiles.add(downloadFile); + } // GO downloadFile = downloadDataSource(configuration.getDownload().getGoObo(), GO_OBO_FILE_ID, oboFolder); @@ -67,20 +84,6 @@ public List download() throws IOException, InterruptedException, C oboFolder.resolve(getDataVersionFilename(GO_OBO_DATA))); downloadFiles.add(downloadFile); - // DOID - downloadFile = downloadDataSource(configuration.getDownload().getDoidObo(), DOID_OBO_FILE_ID, oboFolder); - version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); - saveDataSource(DOID_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - oboFolder.resolve(getDataVersionFilename(DOID_OBO_DATA))); - downloadFiles.add(downloadFile); - - // Mondo - downloadFile = downloadDataSource(configuration.getDownload().getMondoObo(), MONDO_OBO_FILE_ID, oboFolder); - version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); - saveDataSource(MONDO_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - oboFolder.resolve(getDataVersionFilename(MONDO_OBO_DATA))); - downloadFiles.add(downloadFile); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(ONTOLOGY_DATA)); } diff --git a/cellbase-lib/src/test/resources/configuration.test.yaml b/cellbase-lib/src/test/resources/configuration.test.yaml index 941928b876..cff46222d1 100644 --- a/cellbase-lib/src/test/resources/configuration.test.yaml +++ b/cellbase-lib/src/test/resources/configuration.test.yaml @@ -336,7 +336,8 @@ species: - gene - regulation - protein - # - variation + - variation + - ontology - id: rnorvegicus scientificName: Rattus norvegicus assemblies: From d0d92a32ac23db9ba8f36ba6ced4f359f10a50db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 8 Aug 2024 07:57:34 +0200 Subject: [PATCH 140/148] lib: update ontology 
downloader and take into account multi-species support, #TASK-5575, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 1 - .../lib/download/OntologyDownloadManager.java | 31 ++++++++++++------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index d3b3147d38..adca2e3178 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -228,7 +228,6 @@ public final class EtlCommons { // Ontology public static final String ONTOLOGY_DATA = "ontology"; - public static final String OBO_BASENAME = "ontology"; // HPO public static final String HPO_OBO_DATA = "hpo"; // Must match the configuration file diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java index a033403bf9..cabfd2339b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java @@ -56,32 +56,39 @@ public List download() throws IOException, InterruptedException, C if (speciesConfiguration.getScientificName().equalsIgnoreCase(HOMO_SAPIENS)) { // HPO - downloadFile = downloadDataSource(configuration.getDownload().getHpoObo(), HPO_OBO_FILE_ID, oboFolder); - version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); + Files.createDirectories(oboFolder.resolve(HPO_OBO_DATA)); + downloadFile = downloadDataSource(configuration.getDownload().getHpoObo(), HPO_OBO_FILE_ID, + oboFolder.resolve(HPO_OBO_DATA)); + version = getVersionFromOboFile(oboFolder.resolve(HPO_OBO_DATA).resolve(downloadFile.getOutputFile())); saveDataSource(HPO_OBO_DATA, version, getTimeStamp(), 
Collections.singletonList(downloadFile.getUrl()), - oboFolder.resolve(getDataVersionFilename(HPO_OBO_DATA))); + oboFolder.resolve(HPO_OBO_DATA).resolve(getDataVersionFilename(HPO_OBO_DATA))); downloadFiles.add(downloadFile); // DOID - downloadFile = downloadDataSource(configuration.getDownload().getDoidObo(), DOID_OBO_FILE_ID, oboFolder); - version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); + Files.createDirectories(oboFolder.resolve(DOID_OBO_DATA)); + downloadFile = downloadDataSource(configuration.getDownload().getDoidObo(), DOID_OBO_FILE_ID, + oboFolder.resolve(DOID_OBO_DATA)); + version = getVersionFromOboFile(oboFolder.resolve(DOID_OBO_DATA).resolve(downloadFile.getOutputFile())); saveDataSource(DOID_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - oboFolder.resolve(getDataVersionFilename(DOID_OBO_DATA))); + oboFolder.resolve(DOID_OBO_DATA).resolve(getDataVersionFilename(DOID_OBO_DATA))); downloadFiles.add(downloadFile); // Mondo - downloadFile = downloadDataSource(configuration.getDownload().getMondoObo(), MONDO_OBO_FILE_ID, oboFolder); - version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); + Files.createDirectories(oboFolder.resolve(MONDO_OBO_DATA)); + downloadFile = downloadDataSource(configuration.getDownload().getMondoObo(), MONDO_OBO_FILE_ID, + oboFolder.resolve(MONDO_OBO_DATA)); + version = getVersionFromOboFile(oboFolder.resolve(MONDO_OBO_DATA).resolve(downloadFile.getOutputFile())); saveDataSource(MONDO_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - oboFolder.resolve(getDataVersionFilename(MONDO_OBO_DATA))); + oboFolder.resolve(MONDO_OBO_DATA).resolve(getDataVersionFilename(MONDO_OBO_DATA))); downloadFiles.add(downloadFile); } // GO - downloadFile = downloadDataSource(configuration.getDownload().getGoObo(), GO_OBO_FILE_ID, oboFolder); - version = 
getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); + Files.createDirectories(oboFolder.resolve(GO_OBO_DATA)); + downloadFile = downloadDataSource(configuration.getDownload().getGoObo(), GO_OBO_FILE_ID, oboFolder.resolve(GO_OBO_DATA)); + version = getVersionFromOboFile(oboFolder.resolve(GO_OBO_DATA).resolve(downloadFile.getOutputFile())); saveDataSource(GO_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - oboFolder.resolve(getDataVersionFilename(GO_OBO_DATA))); + oboFolder.resolve(GO_OBO_DATA).resolve(getDataVersionFilename(GO_OBO_DATA))); downloadFiles.add(downloadFile); logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(ONTOLOGY_DATA)); From d51114babf2a08673da606e6c3a2636a12831944 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 8 Aug 2024 07:59:19 +0200 Subject: [PATCH 141/148] lib: update ontology builder and take into account multi-species support, #TASK-5576, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 35 +++++++++++--- .../lib/builders/OntologyBuilder.java | 46 ++++++++++++------- 2 files changed, 58 insertions(+), 23 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index d7683c8e59..4215664663 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -46,6 +46,7 @@ import static org.opencb.cellbase.lib.builders.AbstractBuilder.BUILDING_LOG_MESSAGE; import static org.opencb.cellbase.lib.builders.EnsemblGeneBuilder.ENSEMBL_GENE_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.builders.GenomeSequenceFastaBuilder.GENOME_JSON_FILENAME; +import static 
org.opencb.cellbase.lib.builders.OntologyBuilder.OBO_OUTPUT_BASENAME; import static org.opencb.cellbase.lib.builders.ProteinBuilder.PROTEIN_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.builders.RefSeqGeneBuilder.REFSEQ_GENE_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.builders.RegulatoryFeatureBuilder.*; @@ -307,17 +308,37 @@ private AbstractBuilder buildRepeats() throws CellBaseException { } private AbstractBuilder buildObo() throws CellBaseException { + // Sanity check Path oboDownloadPath = downloadFolder.resolve(ONTOLOGY_DATA); Path oboBuildPath = buildFolder.resolve(ONTOLOGY_DATA); - List versionPaths = Arrays.asList(oboDownloadPath.resolve(getDataVersionFilename(HPO_OBO_DATA)), - oboDownloadPath.resolve(getDataVersionFilename(GO_OBO_DATA)), - oboDownloadPath.resolve(getDataVersionFilename(DOID_OBO_DATA)), - oboDownloadPath.resolve(getDataVersionFilename(MONDO_OBO_DATA))); - copyVersionFiles(versionPaths, oboBuildPath); + List filesToCheck = new ArrayList<>(Arrays.asList(oboBuildPath.resolve(OBO_OUTPUT_BASENAME))); + List dataList = new ArrayList<>(Arrays.asList(GO_OBO_DATA)); + if (speciesConfiguration.getScientificName().equalsIgnoreCase(HOMO_SAPIENS)) { + dataList.add(HPO_OBO_DATA); + dataList.add(DOID_OBO_DATA); + dataList.add(MONDO_OBO_DATA); + } + + for (String data : dataList) { + filesToCheck.add(oboBuildPath.resolve(data).resolve(getDataVersionFilename(data))); + } + + if (AbstractBuilder.existFiles(filesToCheck)) { + logger.warn(DATA_ALREADY_BUILT, getDataName(ONTOLOGY_DATA)); + return null; + } + + for (String data : dataList) { + checkVersionFiles(Collections.singletonList(oboDownloadPath.resolve(data).resolve(getDataVersionFilename(data)))); + } + for (String data : dataList) { + copyVersionFiles(Collections.singletonList(oboDownloadPath.resolve(data).resolve(getDataVersionFilename(data))), + oboBuildPath); + } // Create serializer and return the ontology builder - CellBaseSerializer serializer = new 
CellBaseJsonFileSerializer(oboBuildPath, OBO_BASENAME); - return new OntologyBuilder(oboDownloadPath, serializer); + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(oboBuildPath, OBO_OUTPUT_BASENAME); + return new OntologyBuilder(oboDownloadPath, speciesConfiguration, serializer); } private AbstractBuilder buildCadd() throws CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java index 34710bfe3a..9273c451f5 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java @@ -19,6 +19,7 @@ import org.opencb.biodata.formats.obo.OboParser; import org.opencb.biodata.models.core.OntologyTerm; +import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.opencb.commons.utils.FileUtils; @@ -27,6 +28,7 @@ import java.io.File; import java.io.IOException; import java.nio.file.Path; +import java.util.Collections; import java.util.List; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -34,35 +36,44 @@ public class OntologyBuilder extends AbstractBuilder { private Path oboDownloadPath; + private SpeciesConfiguration speciesConfiguration; - public OntologyBuilder(Path oboDownloadPath, CellBaseSerializer serializer) { + public static final String OBO_OUTPUT_BASENAME = "ontology"; + public static final String OBO_OUTPUT_FILENAME = OBO_OUTPUT_BASENAME + ".json.gz"; + + public OntologyBuilder(Path oboDownloadPath, SpeciesConfiguration speciesConfiguration, CellBaseSerializer serializer) { super(serializer); + this.oboDownloadPath = oboDownloadPath; + this.speciesConfiguration = speciesConfiguration; } @Override public void parse() throws Exception { - 
logger.info(BUILDING_LOG_MESSAGE, getDataName(ONTOLOGY_DATA)); - // Sanity check - checkDirectory(oboDownloadPath, getDataName(REGULATION_DATA)); + checkDirectory(oboDownloadPath, getDataName(ONTOLOGY_DATA)); // Check ontology files - List hpoFiles = checkOboFiles(oboDownloadPath.resolve(getDataVersionFilename(HPO_OBO_DATA)), getDataName(HPO_OBO_DATA)); - List goFiles = checkOboFiles(oboDownloadPath.resolve(getDataVersionFilename(GO_OBO_DATA)), getDataName(GO_OBO_DATA)); - List doidFiles = checkOboFiles(oboDownloadPath.resolve(getDataVersionFilename(DOID_OBO_DATA)), getDataName(DOID_OBO_DATA)); - List mondoFiles = checkOboFiles(oboDownloadPath.resolve(getDataVersionFilename(MONDO_OBO_DATA)), getDataName(MONDO_OBO_DATA)); + List hpoFiles = Collections.emptyList(); + List doidFiles = Collections.emptyList(); + List mondoFiles = Collections.emptyList(); + if (speciesConfiguration.getScientificName().equalsIgnoreCase(HOMO_SAPIENS)) { + hpoFiles = checkOboFiles(HPO_OBO_DATA); + doidFiles = checkOboFiles(DOID_OBO_DATA); + mondoFiles = checkOboFiles(MONDO_OBO_DATA); + } + List goFiles = checkOboFiles(GO_OBO_DATA); // Parse OBO files and build - parseOboFile(hpoFiles.get(0), HPO_OBO_DATA); + if (speciesConfiguration.getScientificName().equalsIgnoreCase(HOMO_SAPIENS)) { + parseOboFile(hpoFiles.get(0), HPO_OBO_DATA); + parseOboFile(doidFiles.get(0), DOID_OBO_DATA); + parseOboFile(mondoFiles.get(0), MONDO_OBO_DATA); + } parseOboFile(goFiles.get(0), GO_OBO_DATA); - parseOboFile(doidFiles.get(0), DOID_OBO_DATA); - parseOboFile(mondoFiles.get(0), MONDO_OBO_DATA); // Close serializer serializer.close(); - - logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(ONTOLOGY_DATA)); } private void parseOboFile(File oboFile, String data) throws IOException { @@ -77,9 +88,12 @@ private void parseOboFile(File oboFile, String data) throws IOException { logger.info(PARSING_DONE_LOG_MESSAGE, oboFile); } - private List checkOboFiles(Path versionFilePath, String name) throws IOException, 
CellBaseException { - List files = checkFiles(dataSourceReader.readValue(versionFilePath.toFile()), oboDownloadPath, getDataName(ONTOLOGY_DATA) - + "/" + name); + private List checkOboFiles(String data) throws IOException, CellBaseException { + Path versionFilePath = oboDownloadPath.resolve(data).resolve(getDataVersionFilename(data)); + String name = getDataName(data); + + List files = checkFiles(dataSourceReader.readValue(versionFilePath.toFile()), oboDownloadPath.resolve(data), + getDataName(ONTOLOGY_DATA) + "/" + name); if (files.size() != 1) { throw new CellBaseException("One " + name + " file is expected, but currently there are " + files.size() + " files"); } From 24450d383fa2236de185d4690edcbf5f155631b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 8 Aug 2024 07:59:53 +0200 Subject: [PATCH 142/148] app: update load command executor for ontology data according to the latest changes, #TASK-6142, #TASK-5564 --- .../admin/executors/LoadCommandExecutor.java | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index d373981407..a99cbd4ea2 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -44,6 +44,7 @@ import static org.opencb.cellbase.lib.EtlCommons.*; import static org.opencb.cellbase.lib.builders.EnsemblGeneBuilder.ENSEMBL_GENE_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.builders.GenomeSequenceFastaBuilder.GENOME_JSON_FILENAME; +import static org.opencb.cellbase.lib.builders.OntologyBuilder.OBO_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.builders.ProteinBuilder.PROTEIN_OUTPUT_FILENAME; import static 
org.opencb.cellbase.lib.builders.RefSeqGeneBuilder.REFSEQ_GENE_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.builders.RegulatoryFeatureBuilder.*; @@ -205,19 +206,7 @@ public void execute() throws CellBaseException { // loadStructuralVariants(); // break; case EtlCommons.ONTOLOGY_DATA: { - // Load data - loadIfExists(input.resolve("ontology.json.gz"), "ontology"); - - // Create index - createIndex("ontology"); - - // Update release (collection and sources) - List sources = new ArrayList<>(Arrays.asList( - input.resolve(EtlCommons.HPO_VERSION_FILE), - input.resolve(EtlCommons.GO_VERSION_FILE), - input.resolve(EtlCommons.DO_VERSION_FILE) - )); - dataReleaseManager.update(dataRelease, "ontology", sources); + loadOntology(); break; } case EtlCommons.SPLICE_SCORE_DATA: { @@ -396,6 +385,13 @@ private void loadRepeats() throws CellBaseException { loadData(input.resolve(REPEATS_DATA), collectionMap); } + private void loadOntology() throws CellBaseException { + HashMap collectionMap = new HashMap<>(); + collectionMap.put(ONTOLOGY_DATA, OBO_OUTPUT_FILENAME); + + loadData(input.resolve(ONTOLOGY_DATA), collectionMap); + } + private void loadRegulation() throws CellBaseException { HashMap collectionMap = new HashMap<>(); collectionMap.put(REGULATORY_REGION_BASENAME, REGULATORY_REGION_OUTPUT_FILENAME); From 132382d0f3c23b64c9365408891ea939091b1fdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 8 Aug 2024 09:11:47 +0200 Subject: [PATCH 143/148] app: check data according to the species before loading data, #TASK-6142, #TASK-5564 --- .../admin/executors/LoadCommandExecutor.java | 150 +++++++++--------- .../core/utils/DatabaseNameUtils.java | 72 +++++++++ .../cellbase/lib/db/MongoDBManager.java | 28 +--- .../lib/GenericMongoDBAdaptorTest.java | 4 +- .../impl/core/MongoDBAdaptorFactoryTest.java | 9 +- 5 files changed, 160 insertions(+), 103 deletions(-) create mode 100644 
cellbase-core/src/main/java/org/opencb/cellbase/core/utils/DatabaseNameUtils.java diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index a99cbd4ea2..82e9faae59 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -20,9 +20,12 @@ import org.apache.commons.lang3.StringUtils; import org.opencb.cellbase.app.cli.CommandExecutor; import org.opencb.cellbase.app.cli.admin.AdminCliOptionsParser; +import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.models.DataRelease; import org.opencb.cellbase.core.result.CellBaseDataResult; +import org.opencb.cellbase.core.utils.DatabaseNameUtils; +import org.opencb.cellbase.core.utils.SpeciesUtils; import org.opencb.cellbase.lib.EtlCommons; import org.opencb.cellbase.lib.impl.core.CellBaseDBAdaptor; import org.opencb.cellbase.lib.indexer.IndexManager; @@ -62,7 +65,7 @@ public class LoadCommandExecutor extends CommandExecutor { private AdminCliOptionsParser.LoadCommandOptions loadCommandOptions; private Path input; - private String[] loadOptions; + private List dataList; private int dataRelease; private String database; @@ -78,33 +81,6 @@ public LoadCommandExecutor(AdminCliOptionsParser.LoadCommandOptions loadCommandO super(loadCommandOptions.commonOptions.logLevel, loadCommandOptions.commonOptions.conf); this.loadCommandOptions = loadCommandOptions; - - input = Paths.get(loadCommandOptions.input); - if (loadCommandOptions.database != null) { - database = loadCommandOptions.database; - } - if (loadCommandOptions.data.equals("all")) { - loadOptions = new String[]{EtlCommons.GENOME_DATA, EtlCommons.GENE_DATA, 
EtlCommons.REFSEQ_DATA, - EtlCommons.CONSERVATION_DATA, EtlCommons.REGULATION_DATA, EtlCommons.PROTEIN_DATA, - EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA, EtlCommons.VARIATION_DATA, - EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANT_DATA, EtlCommons.REPEATS_DATA, - EtlCommons.ONTOLOGY_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, - PUBMED_DATA, EtlCommons.PHARMACOGENOMICS_DATA}; - } else { - loadOptions = loadCommandOptions.data.split(","); - } - - - if (loadCommandOptions.field != null) { - field = loadCommandOptions.field; - } - if (loadCommandOptions.innerFields != null) { - innerFields = loadCommandOptions.innerFields.split(","); - } - if (loadCommandOptions.loader != null) { - loader = loadCommandOptions.loader; - } - createIndexes = !loadCommandOptions.skipIndex; } /** @@ -113,13 +89,10 @@ public LoadCommandExecutor(AdminCliOptionsParser.LoadCommandOptions loadCommandO * @throws CellBaseException CellBase exception */ public void execute() throws CellBaseException { - // Init release manager - dataReleaseManager = new DataReleaseManager(database, configuration); - checkParameters(); logger.info("Loading in data release {}", dataRelease); - if (loadCommandOptions.data != null) { + if (CollectionUtils.isNotEmpty(dataList)) { // If 'authenticationDatabase' is not passed by argument then we read it from configuration.json if (loadCommandOptions.loaderParams.containsKey("authenticationDatabase")) { configuration.getDatabases().getMongodb().getOptions().put("authenticationDatabase", @@ -131,9 +104,9 @@ public void execute() throws CellBaseException { indexManager = new IndexManager(database, indexFile, configuration); } - for (String loadOption : loadOptions) { + for (String data : dataList) { try { - switch (loadOption) { + switch (data) { case EtlCommons.GENOME_DATA: { loadGenome(); break; @@ -202,9 +175,6 @@ public void execute() throws CellBaseException { loadRepeats(); break; } -// case 
EtlCommons.STRUCTURAL_VARIANTS_DATA: -// loadStructuralVariants(); -// break; case EtlCommons.ONTOLOGY_DATA: { loadOntology(); break; @@ -225,7 +195,7 @@ public void execute() throws CellBaseException { break; } default: - logger.warn("Not valid 'data'. We should not reach this point"); + logger.warn("Not valid data: {}. We should not reach this point", data); break; } } catch (IllegalAccessException | InstantiationException | InvocationTargetException | ExecutionException @@ -236,20 +206,6 @@ public void execute() throws CellBaseException { } } -// private void loadStructuralVariants() { -// Path path = input.resolve(EtlCommons.STRUCTURAL_VARIANTS_JSON + ".json.gz"); -// if (Files.exists(path)) { -// try { -// logger.debug("Loading '{}' ...", path.toString()); -// loadRunner.load(path, EtlCommons.STRUCTURAL_VARIANTS_DATA); -// loadIfExists(input.resolve(EtlCommons.DGV_VERSION_FILE), "metadata"); -// } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException -// | IllegalAccessException | ExecutionException | IOException | InterruptedException e) { -// logger.error(e.toString()); -// } -// } -// } - private void loadIfExists(Path path, String collection) throws NoSuchMethodException, InterruptedException, ExecutionException, InstantiationException, IOException, IllegalAccessException, InvocationTargetException, ClassNotFoundException, LoaderException, CellBaseException { @@ -266,31 +222,83 @@ private void loadIfExists(Path path, String collection) throws NoSuchMethodExcep } private void checkParameters() throws CellBaseException { - if (loadCommandOptions.numThreads > 1) { - numThreads = loadCommandOptions.numThreads; + // Input folder + if (!Files.exists(Paths.get(loadCommandOptions.input))) { + throw new CellBaseException("Input path '" + loadCommandOptions.input + "' does not exist"); + } + if (!Files.isDirectory(Paths.get(loadCommandOptions.input))) { + throw new CellBaseException("Input path '" + 
loadCommandOptions.input + "' is not a directyory"); + } + input = Paths.get(loadCommandOptions.input); + + // Database + if (StringUtils.isEmpty(loadCommandOptions.database)) { + throw new CellBaseException("Missing database"); + } + database = loadCommandOptions.database; + + // Data + if (StringUtils.isEmpty(loadCommandOptions.data)) { + throw new CellBaseException("Missing data. Please, specify a list of data separated by commas, or use 'all' to load" + + " everything"); + } + String species = DatabaseNameUtils.getSpeciesFromDatabaseName(database); + SpeciesConfiguration speciesConfiguration = SpeciesUtils.getSpeciesConfiguration(configuration, species); + if (speciesConfiguration == null) { + throw new CellBaseException("Species '" + species + "' not supported (database name '" + database + "')"); + } + if (loadCommandOptions.data.equals("all")) { + dataList = speciesConfiguration.getData(); } else { - numThreads = 1; - logger.warn("Incorrect number of numThreads, it must be a positive value. This has been set to '{}'", numThreads); + dataList = Arrays.asList(loadCommandOptions.data.split(",")); + if (CollectionUtils.isEmpty(dataList)) { + throw new CellBaseException("Missing data. 
Please, specify a list of data separated by commas, or use 'all' to load" + + " everything"); + } + Set invalidData = new HashSet<>(); + for (String data : dataList) { + if (!speciesConfiguration.getData().contains(data)) { + invalidData.add(data); + } + } + if (!CollectionUtils.isEmpty(invalidData)) { + throw new CellBaseException("Data '" + StringUtils.join(invalidData, ",") + "' not supported by species '" + species + "'"); + } + } + + // Field + if (StringUtils.isNotEmpty(loadCommandOptions.field)) { + field = loadCommandOptions.field; + } + + // Inner fields + if (StringUtils.isNotEmpty(loadCommandOptions.innerFields)) { + innerFields = loadCommandOptions.innerFields.split(","); } - if (field != null) { - if (loadCommandOptions.data == null) { - logger.error("--data option cannot be empty. Please provide a valid value for the --data parameter."); - } else if (!Files.exists(input)) { - logger.error("Input parameter {} does not exist", input); + // Loader + if (StringUtils.isNotEmpty(loadCommandOptions.loader)) { + loader = loadCommandOptions.loader; + try { + Class.forName(loader); + } catch (ClassNotFoundException e) { + throw new CellBaseException("Loader Java class '" + loader + "' does not exist", e); } - } else if (!Files.exists(input) || !Files.isDirectory(input)) { - logger.error("Input parameter {} does not exist or is not a directory", input); } - try { - Class.forName(loader); - } catch (ClassNotFoundException e) { - logger.error("Loader Java class '{}' does not exist", loader); - e.printStackTrace(); - System.exit(-1); + + // Skip indexes + createIndexes = !loadCommandOptions.skipIndex; + + // Num. threads + if (loadCommandOptions.numThreads > 1) { + numThreads = loadCommandOptions.numThreads; + } else { + numThreads = 1; + logger.warn("Incorrect number of numThreads, it must be a positive value. 
This has been set to '{}'", numThreads); } - // Check data release + // Data release + dataReleaseManager = new DataReleaseManager(database, configuration); dataRelease = getDataReleaseForLoading(dataReleaseManager).getRelease(); } @@ -304,7 +312,7 @@ private void loadVariation() throws NoSuchMethodException, InterruptedException, } else { // Custom update required e.g. population freqs loading logger.info("Loading file '{}'", input); - loadRunner.load(input, "variation", dataRelease, field, innerFields); + loadRunner.load(input, VARIATION_DATA, dataRelease, field, innerFields); } } diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/utils/DatabaseNameUtils.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/utils/DatabaseNameUtils.java new file mode 100644 index 0000000000..12954e950f --- /dev/null +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/utils/DatabaseNameUtils.java @@ -0,0 +1,72 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.opencb.cellbase.core.utils; + +import org.apache.commons.lang3.StringUtils; + +import java.security.InvalidParameterException; +import java.util.Locale; + + +public final class DatabaseNameUtils { + + public static final String DBNAME_PREFIX = "cellbase"; + public static final String DBNAME_SEPARATOR = "_"; + + private DatabaseNameUtils() { + throw new IllegalStateException("Utility class"); + } + + public static String getDatabaseName(String species, String assembly, String version) { + if (StringUtils.isEmpty(species) || StringUtils.isEmpty(assembly)) { + throw new InvalidParameterException("Both species and assembly are required"); + } + + // Remove special characters + String dbnameAssembly = cleanAssembly(assembly); + + // Process version from the configuration file, in order to suffix the database name + // - Production environment, e.g.: if version is "v5", the suffix added wil be "_v5" + // - Test environment, e.g.: if version is "v5.6" or "v5.6.0-SNAPSHOT", the suffix added will be "_v5_6" + String auxVersion = version.replace(".", DBNAME_SEPARATOR).replace("-", DBNAME_SEPARATOR); + String[] split = auxVersion.split(DBNAME_SEPARATOR); + String dbName = DBNAME_PREFIX + DBNAME_SEPARATOR + species.toLowerCase() + DBNAME_SEPARATOR + dbnameAssembly.toLowerCase() + + DBNAME_SEPARATOR + split[0]; + if (split.length > 1) { + dbName += (DBNAME_SEPARATOR + split[1]); + } + return dbName; + } + + public static String cleanAssembly(String assembly) { + if (StringUtils.isEmpty(assembly)) { + throw new InvalidParameterException("Assembly is empty"); + } + + return assembly.replace("\\.", "") + .replace("-", "") + .replace("_", "").toLowerCase(Locale.ROOT); + } + + public static String getSpeciesFromDatabaseName(String databaseName) { + if (StringUtils.isEmpty(databaseName)) { + throw new InvalidParameterException("Database name is empty"); + } + + return databaseName.split(DBNAME_SEPARATOR)[1].toLowerCase(Locale.ROOT); + } +} diff --git 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/db/MongoDBManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/db/MongoDBManager.java index 27e3239f94..6c5d4cf679 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/db/MongoDBManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/db/MongoDBManager.java @@ -26,6 +26,7 @@ import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.models.DataRelease; import org.opencb.cellbase.core.result.CellBaseDataResult; +import org.opencb.cellbase.core.utils.DatabaseNameUtils; import org.opencb.cellbase.core.utils.SpeciesUtils; import org.opencb.cellbase.lib.impl.core.ReleaseMongoDBAdaptor; import org.opencb.commons.datastore.core.DataStoreServerAddress; @@ -49,8 +50,6 @@ public class MongoDBManager { - public static final String DBNAME_SEPARATOR = "_"; - private MongoDataStoreManager mongoDataStoreManager; private final CellBaseConfiguration cellBaseConfiguration; @@ -99,7 +98,7 @@ public MongoDataStore createMongoDBDatastore(String speciesStr, String assemblyS // cellbase_speciesId_assembly_cellbaseVersion // Example: // cellbase_hsapiens_grch37_v3 - String database = getDatabaseName(species.getId(), species.getAssembly(), cellBaseConfiguration.getVersion()); + String database = DatabaseNameUtils.getDatabaseName(species.getId(), species.getAssembly(), cellBaseConfiguration.getVersion()); logger.debug("Database for the species is '{}'", database); return createMongoDBDatastore(database); } catch (CellBaseException e) { @@ -162,29 +161,6 @@ public MongoDataStore createMongoDBDatastore(String database) { return mongoDatastore; } - public static String getDatabaseName(String species, String assembly, String version) { - if (StringUtils.isEmpty(species) || StringUtils.isEmpty(assembly)) { - throw new InvalidParameterException("Species and assembly are required"); - } - - String cleanAssembly = assembly - .replaceAll("\\.", "") - .replaceAll("-", "") - 
.replaceAll("_", ""); - - // Process version from the configuration file, in order to suffix the database name - // - Production environment, e.g.: if version is "v5", the suffix added wil be "_v5" - // - Test environment, e.g.: if version is "v5.6" or "v5.6.0-SNAPSHOT", the suffix added will be "_v5_6" - String auxVersion = version.replace(".", DBNAME_SEPARATOR).replace("-", DBNAME_SEPARATOR); - String[] split = auxVersion.split(DBNAME_SEPARATOR); - String dbName = "cellbase" + DBNAME_SEPARATOR + species.toLowerCase() + DBNAME_SEPARATOR + cleanAssembly.toLowerCase() - + DBNAME_SEPARATOR + split[0]; - if (split.length > 1) { - dbName += (DBNAME_SEPARATOR + split[1]); - } - return dbName; - } - public Map getDatabaseStatus(String species, String assembly) { MongoDataStore mongoDatastore = createMongoDBDatastore(species, assembly); try { diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/GenericMongoDBAdaptorTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/GenericMongoDBAdaptorTest.java index 4eaf34b026..bfb4eb8680 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/GenericMongoDBAdaptorTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/GenericMongoDBAdaptorTest.java @@ -23,7 +23,7 @@ import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.models.DataRelease; import org.opencb.cellbase.core.result.CellBaseDataResult; -import org.opencb.cellbase.lib.db.MongoDBManager; +import org.opencb.cellbase.core.utils.DatabaseNameUtils; import org.opencb.cellbase.lib.loader.LoadRunner; import org.opencb.cellbase.lib.loader.LoaderException; import org.opencb.cellbase.lib.managers.CellBaseManagerFactory; @@ -94,7 +94,7 @@ public GenericMongoDBAdaptorTest() { cellBaseConfiguration.setVersion("v" + versionSplit[0] + "." 
+ versionSplit[1]); cellBaseManagerFactory = new CellBaseManagerFactory(cellBaseConfiguration); - cellBaseName = MongoDBManager.getDatabaseName(SPECIES, ASSEMBLY, cellBaseConfiguration.getVersion()); + cellBaseName = DatabaseNameUtils.getDatabaseName(SPECIES, ASSEMBLY, cellBaseConfiguration.getVersion()); loadRunner = new LoadRunner(MONGODB_CELLBASE_LOADER, cellBaseName, 2, cellBaseManagerFactory.getDataReleaseManager(SPECIES, ASSEMBLY), cellBaseConfiguration); diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/impl/core/MongoDBAdaptorFactoryTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/impl/core/MongoDBAdaptorFactoryTest.java index 3e2c755f98..b1c244a9b3 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/impl/core/MongoDBAdaptorFactoryTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/impl/core/MongoDBAdaptorFactoryTest.java @@ -18,13 +18,14 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.TestInstance; +import org.opencb.cellbase.core.utils.DatabaseNameUtils; import org.opencb.cellbase.lib.GenericMongoDBAdaptorTest; import org.opencb.cellbase.lib.db.MongoDBManager; import java.security.InvalidParameterException; import static org.junit.jupiter.api.Assertions.*; -import static org.opencb.cellbase.lib.db.MongoDBManager.DBNAME_SEPARATOR; +import static org.opencb.cellbase.core.utils.DatabaseNameUtils.DBNAME_SEPARATOR; @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class MongoDBAdaptorFactoryTest extends GenericMongoDBAdaptorTest { @@ -46,18 +47,18 @@ public void testGetDatabaseName() { } // provide assembly - String databaseName = mongoDBManager.getDatabaseName("speciesName", "assemblyName", cellBaseConfiguration.getVersion()); + String databaseName = DatabaseNameUtils.getDatabaseName("speciesName", "assemblyName", cellBaseConfiguration.getVersion()); assertEquals("cellbase_speciesname_assemblyname_" + version, databaseName); // don't provide assembly InvalidParameterException 
thrown = assertThrows(InvalidParameterException.class, - () -> mongoDBManager.getDatabaseName("speciesName", null, cellBaseConfiguration.getVersion()), + () -> DatabaseNameUtils.getDatabaseName("speciesName", null, cellBaseConfiguration.getVersion()), "Expected getDatabaseName() to throw an exception, but it didn't"); assertTrue(thrown.getMessage().contains("Species and assembly are required")); // handle special characters - databaseName = mongoDBManager.getDatabaseName("speciesName", "my_funny.assembly--name", cellBaseConfiguration.getVersion()); + databaseName = DatabaseNameUtils.getDatabaseName("speciesName", "my_funny.assembly--name", cellBaseConfiguration.getVersion()); assertEquals("cellbase_speciesname_myfunnyassemblyname_" + version, databaseName); } } \ No newline at end of file From d556c4c45879ef33139c465ebd95181c6d4c4296 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 8 Aug 2024 09:45:25 +0200 Subject: [PATCH 144/148] app: fix sonnar issues, #TASK-6142, #TASK-5564 --- .../admin/executors/LoadCommandExecutor.java | 161 ++++++++++-------- .../org/opencb/cellbase/lib/EtlCommons.java | 4 +- 2 files changed, 91 insertions(+), 74 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index 82e9faae59..bca25b48a9 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -60,7 +60,6 @@ */ public class LoadCommandExecutor extends CommandExecutor { - private static final String METADATA = "metadata"; private LoadRunner loadRunner; private AdminCliOptionsParser.LoadCommandOptions loadCommandOptions; @@ -77,6 +76,12 @@ public class LoadCommandExecutor extends CommandExecutor { private IndexManager 
indexManager; private DataReleaseManager dataReleaseManager; + private static final String AUTHENTICATION_DATABASE = "authenticationDatabase"; + + private static final String LOADING_FILE_LOG_MESSAGE = "Loading file '{}'"; + private static final String ERROR_LOADING_FILE_LOG_MESSAGE = "Error loading file '{}': {}"; + private static final String ERROR_LOADING_DATA = "Error loading data in collection "; + public LoadCommandExecutor(AdminCliOptionsParser.LoadCommandOptions loadCommandOptions) { super(loadCommandOptions.commonOptions.logLevel, loadCommandOptions.commonOptions.conf); @@ -94,9 +99,9 @@ public void execute() throws CellBaseException { if (CollectionUtils.isNotEmpty(dataList)) { // If 'authenticationDatabase' is not passed by argument then we read it from configuration.json - if (loadCommandOptions.loaderParams.containsKey("authenticationDatabase")) { - configuration.getDatabases().getMongodb().getOptions().put("authenticationDatabase", - loadCommandOptions.loaderParams.get("authenticationDatabase")); + if (loadCommandOptions.loaderParams.containsKey(AUTHENTICATION_DATABASE)) { + configuration.getDatabases().getMongodb().getOptions().put(AUTHENTICATION_DATABASE, + loadCommandOptions.loaderParams.get(AUTHENTICATION_DATABASE)); } loadRunner = new LoadRunner(loader, database, numThreads, dataReleaseManager, configuration); if (createIndexes) { @@ -121,27 +126,26 @@ public void execute() throws CellBaseException { } case EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA: { // Load data - loadIfExists(input.resolve("cadd.json.gz"), "variation_functional_score"); + loadIfExists(input.resolve("cadd.json.gz"), VARIATION_FUNCTIONAL_SCORE_DATA); // Create index - createIndex("variation_functional_score"); + createIndex(VARIATION_FUNCTIONAL_SCORE_DATA); // Update release (collection and sources) List sources = new ArrayList<>(Collections.singletonList(input.resolve("caddVersion.json"))); - dataReleaseManager.update(dataRelease, "variation_functional_score", sources); + 
dataReleaseManager.update(dataRelease, VARIATION_FUNCTIONAL_SCORE_DATA, sources); break; } case EtlCommons.MISSENSE_VARIATION_SCORE_DATA: { // Load data - loadIfExists(input.resolve("missense_variation_functional_score.json.gz"), - "missense_variation_functional_score"); + loadIfExists(input.resolve("missense_variation_functional_score.json.gz"), MISSENSE_VARIATION_SCORE_DATA); // Create index - createIndex("missense_variation_functional_score"); + createIndex(MISSENSE_VARIATION_SCORE_DATA); // Update release (collection and sources) List sources = new ArrayList<>(Collections.singletonList(input.resolve("revelVersion.json"))); - dataReleaseManager.update(dataRelease, "missense_variation_functional_score", sources); + dataReleaseManager.update(dataRelease, MISSENSE_VARIATION_SCORE_DATA, sources); break; } case EtlCommons.CONSERVATION_DATA: { @@ -156,11 +160,6 @@ public void execute() throws CellBaseException { loadProtein(); break; } -// case EtlCommons.PPI_DATA: -// loadIfExists(input.resolve("protein_protein_interaction.json.gz"), "protein_protein_interaction"); -// loadIfExists(input.resolve("intactVersion.json"), METADATA); -// createIndex("protein_protein_interaction"); -// break; case EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA: { // Load data, create index and update release loadProteinFunctionalPrediction(); @@ -199,8 +198,12 @@ public void execute() throws CellBaseException { break; } } catch (IllegalAccessException | InstantiationException | InvocationTargetException | ExecutionException - | NoSuchMethodException | InterruptedException | ClassNotFoundException | LoaderException | IOException e) { - e.printStackTrace(); + | NoSuchMethodException | ClassNotFoundException | LoaderException | IOException e) { + logger.error(Arrays.toString(e.getStackTrace())); + } catch (InterruptedException e) { + logger.error(Arrays.toString(e.getStackTrace())); + // Restore interrupted state... 
+ Thread.currentThread().interrupt(); } } } @@ -311,14 +314,12 @@ private void loadVariation() throws NoSuchMethodException, InterruptedException, loadData(input.resolve(VARIATION_DATA), VARIATION_DATA, VARIATION_CHR_PREFIX); } else { // Custom update required e.g. population freqs loading - logger.info("Loading file '{}'", input); + logger.info(LOADING_FILE_LOG_MESSAGE, input); loadRunner.load(input, VARIATION_DATA, dataRelease, field, innerFields); } } - private void loadConservation() throws NoSuchMethodException, InterruptedException, ExecutionException, - InstantiationException, IllegalAccessException, InvocationTargetException, ClassNotFoundException, - IOException, CellBaseException, LoaderException { + private void loadConservation() throws IOException, CellBaseException { loadData(input.resolve(CONSERVATION_DATA), CONSERVATION_DATA, "conservation_"); } @@ -326,19 +327,20 @@ private void loadProteinFunctionalPrediction() throws NoSuchMethodException, Int InstantiationException, IllegalAccessException, InvocationTargetException, ClassNotFoundException, IOException, CellBaseException, LoaderException { // Load data - DirectoryStream stream = Files.newDirectoryStream(input, - entry -> entry.getFileName().toString().startsWith("prot_func_pred_")); + try (DirectoryStream stream = Files.newDirectoryStream(input, + entry -> entry.getFileName().toString().startsWith("prot_func_pred_"))) { - for (Path entry : stream) { - logger.info("Loading file '{}'", entry); - loadRunner.load(input.resolve(entry.getFileName()), "protein_functional_prediction", dataRelease); - } + for (Path entry : stream) { + logger.info(LOADING_FILE_LOG_MESSAGE, entry); + loadRunner.load(input.resolve(entry.getFileName()), PROTEIN_FUNCTIONAL_PREDICTION_DATA, dataRelease); + } - // Create index - createIndex("protein_functional_prediction"); + // Create index + createIndex(PROTEIN_FUNCTIONAL_PREDICTION_DATA); - // Update release (collection and sources) - dataReleaseManager.update(dataRelease, 
"protein_functional_prediction", null); + // Update release (collection and sources) + dataReleaseManager.update(dataRelease, PROTEIN_FUNCTIONAL_PREDICTION_DATA, null); + } } private void loadClinical() throws FileNotFoundException { @@ -347,10 +349,10 @@ private void loadClinical() throws FileNotFoundException { try { // Load data logger.info("Loading '{}' ...", path); - loadRunner.load(path, "clinical_variants", dataRelease); + loadRunner.load(path, CLINICAL_VARIANT_DATA, dataRelease); // Create index - createIndex("clinical_variants"); + createIndex(CLINICAL_VARIANT_DATA); // Update release (collection and sources) List sources = new ArrayList<>(Arrays.asList( @@ -358,12 +360,14 @@ private void loadClinical() throws FileNotFoundException { input.resolve("cosmicVersion.json"), input.resolve("gwasVersion.json") )); - dataReleaseManager.update(dataRelease, "clinical_variants", sources); + dataReleaseManager.update(dataRelease, CLINICAL_VARIANT_DATA, sources); } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException - | IllegalAccessException | ExecutionException | IOException | InterruptedException | CellBaseException e) { - logger.error(e.toString()); - } catch (LoaderException e) { - e.printStackTrace(); + | IllegalAccessException | ExecutionException | IOException | LoaderException | CellBaseException e) { + logger.error(Arrays.toString(e.getStackTrace())); + } catch (InterruptedException e) { + logger.error(Arrays.toString(e.getStackTrace())); + // Restore interrupted state... 
+ Thread.currentThread().interrupt(); } } else { throw new FileNotFoundException("File " + path + " does not exist"); @@ -439,13 +443,14 @@ private void loadSpliceScores(Path spliceFolder) throws IOException, ExecutionEx ClassNotFoundException, InvocationTargetException, NoSuchMethodException, InstantiationException, IllegalAccessException, LoaderException, CellBaseException { // Get files from folder - DirectoryStream stream = Files.newDirectoryStream(spliceFolder, - entry -> entry.getFileName().toString().startsWith("splice_score_")); + try (DirectoryStream stream = Files.newDirectoryStream(spliceFolder, + entry -> entry.getFileName().toString().startsWith("splice_score_"))) { - // Load from JSON files - for (Path entry : stream) { - logger.info("Loading file '{}'", entry); - loadRunner.load(spliceFolder.resolve(entry.getFileName()), "splice_score", dataRelease); + // Load from JSON files + for (Path entry : stream) { + logger.info(LOADING_FILE_LOG_MESSAGE, entry); + loadRunner.load(spliceFolder.resolve(entry.getFileName()), "splice_score", dataRelease); + } } } @@ -456,12 +461,16 @@ private void loadPubMed() throws CellBaseException { // Load data for (File file : pubmedPath.toFile().listFiles()) { if (file.isFile() && (file.getName().endsWith("gz"))) { - logger.info("Loading file '{}'", file.getName()); + logger.info(LOADING_FILE_LOG_MESSAGE, file.getName()); try { loadRunner.load(file.toPath(), PUBMED_DATA, dataRelease); } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException - | IllegalAccessException | ExecutionException | IOException | InterruptedException | LoaderException e) { - logger.error("Error loading file '{}': {}", file.getName(), e.toString()); + | IllegalAccessException | ExecutionException | IOException | LoaderException e) { + logger.error(ERROR_LOADING_FILE_LOG_MESSAGE, file.getName(), Arrays.toString(e.getStackTrace())); + } catch (InterruptedException e) { + 
logger.error(ERROR_LOADING_FILE_LOG_MESSAGE, file.getName(), Arrays.toString(e.getStackTrace())); + // Restore interrupted state... + Thread.currentThread().interrupt(); } } } @@ -486,15 +495,17 @@ private void loadPharmacogenomica() throws IOException, CellBaseException { // Load data Path pharmaJsonPath = pharmaPath.resolve(EtlCommons.PHARMACOGENOMICS_DATA + ".json.gz"); - logger.info("Loading file '{}'", pharmaJsonPath.toFile().getName()); + logger.info(LOADING_FILE_LOG_MESSAGE, pharmaJsonPath.toFile().getName()); try { loadRunner.load(pharmaJsonPath, EtlCommons.PHARMACOGENOMICS_DATA, dataRelease); } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException - | IllegalAccessException | ExecutionException | IOException | InterruptedException | CellBaseException - | LoaderException e) { - logger.error("Error loading file '{}': {}", pharmaJsonPath.toFile().getName(), e.toString()); + | IllegalAccessException | ExecutionException | IOException | CellBaseException | LoaderException e) { + logger.error(ERROR_LOADING_FILE_LOG_MESSAGE, pharmaJsonPath.toFile().getName(), Arrays.toString(e.getStackTrace())); + } catch (InterruptedException e) { + logger.error(ERROR_LOADING_FILE_LOG_MESSAGE, pharmaJsonPath.toFile().getName(), Arrays.toString(e.getStackTrace())); + // Restore interrupted state... 
+ Thread.currentThread().interrupt(); } - // Create index createIndex(EtlCommons.PHARMACOGENOMICS_DATA); @@ -516,29 +527,33 @@ private void loadData(Path buildPath, Map collectionMap) throws private void loadData(Path buildPath, String collection, String prefix) throws CellBaseException, IOException { // Load data - DirectoryStream stream = Files.newDirectoryStream(buildPath, entry -> entry.getFileName().toString().startsWith(prefix)); + try (DirectoryStream stream = Files.newDirectoryStream(buildPath, + entry -> entry.getFileName().toString().startsWith(prefix))) { - try { for (Path entry : stream) { logger.info("Loading JSON file '{}' ...", entry); - loadRunner.load(buildPath.resolve(entry.getFileName()), collection, dataRelease); - logger.info(DONE_LOG_MESSAGE); + try { + loadRunner.load(buildPath.resolve(entry.getFileName()), collection, dataRelease); + logger.info(DONE_LOG_MESSAGE); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } catch (Exception e) { + throw new CellBaseException(ERROR_LOADING_DATA + collection, e); + } } - } catch (Exception e) { - throw new CellBaseException("Error loading data in collection '" + collection + "'", e); - } - // Create index - createIndex(collection); + // Create index + createIndex(collection); - // Update the data release collection - dataReleaseManager.update(dataRelease, collection, getVersionPaths(buildPath)); + // Update the data release collection + dataReleaseManager.update(dataRelease, collection, getVersionPaths(buildPath)); + } } private void loadJsonFile(String collection, Path jsonPath) throws CellBaseException { if (!Files.exists(jsonPath)) { - logger.warn("JSON file '{}' not found. No data will be loaded in collection '{}'.", jsonPath, - CellBaseDBAdaptor.buildCollectionName(collection, dataRelease)); + String collectionName = CellBaseDBAdaptor.buildCollectionName(collection, dataRelease); + logger.warn("JSON file '{}' not found. 
No data will be loaded in collection '{}'.", jsonPath, collectionName); return; } @@ -547,8 +562,12 @@ private void loadJsonFile(String collection, Path jsonPath) throws CellBaseExcep logger.info("Loading JSON file '{}' ...", jsonPath); loadRunner.load(jsonPath, collection, dataRelease); logger.info(DONE_LOG_MESSAGE); + } catch (InterruptedException e) { + // Restore interrupted state... + Thread.currentThread().interrupt(); + throw new CellBaseException(ERROR_LOADING_DATA + collection, e); } catch (Exception e) { - throw new CellBaseException("Error loading data in collection '" + collection + "'", e); + throw new CellBaseException(ERROR_LOADING_DATA + collection, e); } // Create index @@ -596,11 +615,9 @@ private DataRelease getDataReleaseForLoading(DataReleaseManager dataReleaseManag throw new CellBaseException("No data releases are available"); } DataRelease lastDataRelease = null; - for (DataRelease dataRelease : dataReleaseResults.getResults()) { - if (lastDataRelease == null) { - lastDataRelease = dataRelease; - } else if (dataRelease.getRelease() > lastDataRelease.getRelease()) { - lastDataRelease = dataRelease; + for (DataRelease dr : dataReleaseResults.getResults()) { + if (lastDataRelease == null || dr.getRelease() > lastDataRelease.getRelease()) { + lastDataRelease = dr; } } if (lastDataRelease == null) { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index adca2e3178..362c1f63cb 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -187,8 +187,8 @@ public final class EtlCommons { public static final String REVEL_FILE_ID = "REVEL"; // Clinical variants data - public static final String CLINICAL_VARIANT_DATA = "clinical_variant"; - public static final String CLINICAL_VARIANTS_BASENAME = "clinicalVariant"; + public static final String 
CLINICAL_VARIANT_DATA = "clinical_variants"; + public static final String CLINICAL_VARIANTS_BASENAME = "clinicalVariants"; // ClinVar public static final String CLINVAR_DATA = "clinvar"; public static final String CLINVAR_CHUNKS_SUBDIRECTORY = "clinvar_chunks"; From 1f3572cbaf4318c5ff5c82f6b02a713e88881ac0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 9 Aug 2024 12:33:06 +0200 Subject: [PATCH 145/148] lib: fix the function to save status and message of the downloaded files; improve summary log and update downloader managers according to these changes, #TASK-5575, #TASK-5564 --- .../executors/DownloadCommandExecutor.java | 32 ++- .../admin/executors/LoadCommandExecutor.java | 6 +- .../org/opencb/cellbase/lib/EtlCommons.java | 15 +- .../lib/builders/PharmGKBBuilder.java | 2 +- .../variant/ClinicalVariantBuilder.java | 6 +- .../lib/download/AbstractDownloadManager.java | 100 ++++---- .../lib/download/CaddDownloadManager.java | 23 +- .../lib/download/ClinicalDownloadManager.java | 93 +++---- .../download/ConservationDownloadManager.java | 233 ++++++++---------- .../lib/download/GeneDownloadManager.java | 98 +++----- .../lib/download/GenomeDownloadManager.java | 13 +- .../MissenseScoresDownloadManager.java | 19 +- .../lib/download/OntologyDownloadManager.java | 92 +++---- .../lib/download/PharmGKBDownloadManager.java | 49 ++-- .../lib/download/ProteinDownloadManager.java | 81 +++--- .../lib/download/PubMedDownloadManager.java | 17 +- .../download/RegulationDownloadManager.java | 77 +++--- .../lib/download/RepeatsDownloadManager.java | 125 +++++----- .../download/SpliceScoreDownloadManager.java | 12 +- .../download/VariationDownloadManager.java | 22 +- 20 files changed, 530 insertions(+), 585 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java 
index 1b7955b095..6d3221869b 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java @@ -16,6 +16,7 @@ package org.opencb.cellbase.app.cli.admin.executors; +import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.StringUtils; import org.opencb.cellbase.app.cli.CommandExecutor; import org.opencb.cellbase.app.cli.admin.AdminCliOptionsParser; @@ -26,9 +27,7 @@ import java.nio.file.Path; import java.nio.file.Paths; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; +import java.util.*; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -62,10 +61,12 @@ public void execute() throws CellBaseException { throw new CellBaseException("Invalid species: '" + downloadCommandOptions.speciesAndAssemblyOptions.species + "'"); } List dataList = getDataList(species, speciesConfiguration); - logger.info("Downloading the following data sources: {}", StringUtils.join(dataList, ",")); + logger.info("Downloading the following data sources: {}", CollectionUtils.isEmpty(dataList) + ? 
Collections.emptyList() + : StringUtils.join(dataList, ",")); List downloadFiles = new ArrayList<>(); - AbstractDownloadManager downloader; + AbstractDownloadManager downloader = null; for (String data : dataList) { switch (data) { case GENOME_DATA: @@ -119,7 +120,16 @@ public void execute() throws CellBaseException { // Call to download method and add the files to the list downloadFiles.addAll(downloader.download()); } - AbstractDownloadManager.writeDownloadLogFile(outputDirectory, downloadFiles); + if (downloader != null) { + Map params = new HashMap<>(); + params.put("species", species); + params.put("assembly", assembly); + params.put("data", dataList); + params.put("outDir", outputDirectory); + downloader.writeDownloadLogFile(params, downloadFiles); + } else { + logger.warn("Impossible to write log summary: downloader is null"); + } } catch (InterruptedException e) { // Restore interrupted state... Thread.currentThread().interrupt(); @@ -138,13 +148,17 @@ private List getDataList(String species, SpeciesConfiguration speciesCon } else { // Check if the data sources requested are valid for the species dataList = Arrays.asList(downloadCommandOptions.data.split(",")); + Set invalidData = new HashSet<>(); for (String data : dataList) { if (!speciesConfig.getData().contains(data)) { - throw new CellBaseException("Data parameter '" + data + "' does not exist or it is not allowed for '" + species + "'. " - + "Valid values are: " + StringUtils.join(speciesConfig.getData(), ",") + ". " - + "You can use data parameter 'all' to download everything"); + invalidData.add(data); } } + if (!CollectionUtils.isEmpty(invalidData)) { + throw new CellBaseException("Data '" + StringUtils.join(invalidData, ",") + "' not supported by species '" + species + "'." + + "Valid values are: " + StringUtils.join(speciesConfig.getData(), ",") + ". 
Our use data parameter 'all' to" + + " download everything"); + } } return dataList; } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index bca25b48a9..6db7e4dc6e 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -534,7 +534,7 @@ private void loadData(Path buildPath, String collection, String prefix) throws C logger.info("Loading JSON file '{}' ...", entry); try { loadRunner.load(buildPath.resolve(entry.getFileName()), collection, dataRelease); - logger.info(DONE_LOG_MESSAGE); + logger.info(DONE_MSG); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } catch (Exception e) { @@ -561,7 +561,7 @@ private void loadJsonFile(String collection, Path jsonPath) throws CellBaseExcep // Load data logger.info("Loading JSON file '{}' ...", jsonPath); loadRunner.load(jsonPath, collection, dataRelease); - logger.info(DONE_LOG_MESSAGE); + logger.info(DONE_MSG); } catch (InterruptedException e) { // Restore interrupted state... 
Thread.currentThread().interrupt(); @@ -587,7 +587,7 @@ private void createIndex(String collection) { collectionName = CellBaseDBAdaptor.buildCollectionName(collection, dataRelease); logger.info("Creating indexes for collection '{}' ...", collectionName); indexManager.createMongoDBIndexes(Collections.singletonList(collectionName), true); - logger.info(DONE_LOG_MESSAGE); + logger.info(DONE_MSG); } catch (IOException e) { logger.error("Error creating indexes for collection '{}': {}", collectionName, Arrays.toString(e.getStackTrace())); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 362c1f63cb..4e469e2258 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -65,8 +65,9 @@ public final class EtlCommons { public static final String FAI_EXTENSION = ".fai"; public static final String GZ_EXTENSION = ".gz"; - public static final String OK_LOG_MESSAGE = "Ok."; - public static final String DONE_LOG_MESSAGE = "Done."; + public static final String OK_MSG = "Ok."; + public static final String DONE_MSG = "Done."; + public static final String DATA_NOT_SUPPORTED_MSG = "Data '{}' not supported for species '{}'"; // Ensembl public static final String ENSEMBL_DATA = "ensembl"; @@ -499,10 +500,10 @@ private EtlCommons() { throw new IllegalStateException("Utility class"); } - public static boolean runCommandLineProcess(File workingDirectory, String binPath, List args, String logFilePath) + public static boolean runCommandLineProcess(File workingDirectory, String binPath, List args, Path logFile) throws IOException, InterruptedException, CellBaseException { - ProcessBuilder builder = getProcessBuilder(workingDirectory, binPath, args, logFilePath); + ProcessBuilder builder = getProcessBuilder(workingDirectory, binPath, args, logFile); LOGGER.info("Executing command: {}", 
StringUtils.join(builder.command(), " ")); Process process = builder.start(); @@ -519,7 +520,7 @@ public static boolean runCommandLineProcess(File workingDirectory, String binPat return true; } - private static ProcessBuilder getProcessBuilder(File workingDirectory, String binPath, List args, String logFilePath) { + private static ProcessBuilder getProcessBuilder(File workingDirectory, String binPath, List args, Path logFile) { List commandArgs = new ArrayList<>(); commandArgs.add(binPath); commandArgs.addAll(args); @@ -530,8 +531,8 @@ private static ProcessBuilder getProcessBuilder(File workingDirectory, String bi builder.directory(workingDirectory); } builder.redirectErrorStream(true); - if (logFilePath != null) { - builder.redirectOutput(ProcessBuilder.Redirect.appendTo(new File(logFilePath))); + if (logFile != null) { + builder.redirectOutput(ProcessBuilder.Redirect.appendTo(logFile.toFile())); } return builder; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java index dc5bb32ee2..1cfd85ae07 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java @@ -1057,7 +1057,7 @@ private void unzipDownloadedFiles(List pharmGkbFiles) throws CellBaseExcep try { String outPath = serializer.getOutdir().resolve(pharmGgkFile.getName().split("\\.")[0]).toString(); List params = Arrays.asList("-d", outPath, "-o", pharmGgkFile.toString()); - EtlCommons.runCommandLineProcess(null, "unzip", params, Paths.get(outPath + ".log").toString()); + EtlCommons.runCommandLineProcess(null, "unzip", params, Paths.get(outPath + ".log")); } catch (CellBaseException e) { if (pharmGgkFile.getName().contains(GUIDELINE_ANNOTATIONS_BASENAME)) { // It fails because of long filenames, so it does not raise any exception diff --git 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java index e3b18cd147..9b3457dc78 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java @@ -89,10 +89,10 @@ public void check() throws CellBaseException, IOException { if (!Files.exists(genomeSequenceFilePath)) { throw new CellBaseException("Genome file path does not exist " + genomeSequenceFilePath); } - logger.info(OK_LOG_MESSAGE); + logger.info(OK_MSG); logger.info("Checking index for genome FASTA file ..."); getIndexFastaReferenceGenome(genomeSequenceFilePath); - logger.info(OK_LOG_MESSAGE); + logger.info(OK_MSG); // Check ClinVar files clinvarFullReleaseFilePath = checkFile(CLINVAR_DATA, configuration.getDownload().getClinvar(), CLINVAR_FULL_RELEASE_FILE_ID, @@ -137,7 +137,7 @@ public void parse() throws IOException, RocksDBException, CellBaseException { Files.createDirectories(chunksPath); logger.info("Splitting CliVar file {} in {} ...", clinvarFullReleaseFilePath, chunksPath); splitClinvar(clinvarFullReleaseFilePath, chunksPath); - logger.info(OK_LOG_MESSAGE); + logger.info(OK_MSG); } RocksDB rdb = null; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index 3f852ec8ad..d88ef5d389 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -35,13 +35,11 @@ import org.slf4j.LoggerFactory; import java.io.BufferedReader; -import java.io.File; import java.io.FileReader; import java.io.IOException; import 
java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.Paths; import java.sql.Timestamp; import java.text.SimpleDateFormat; import java.time.LocalDateTime; @@ -51,13 +49,13 @@ public abstract class AbstractDownloadManager { - protected static final String DOWNLOADING_LOG_MESSAGE = "Downloading {} ..."; - protected static final String DOWNLOADING_DONE_LOG_MESSAGE = "Downloading {} done."; - protected static final String CATEGORY_DOWNLOADING_LOG_MESSAGE = "Downloading {}/{} ..."; - protected static final String CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE = "Downloading {}/{} done."; - protected static final String DOWNLOADING_FROM_TO_LOG_MESSAGE = "Downloading {} to {} ..."; - protected static final String DATA_ALREADY_DOWNLOADED = "The file {} already exists, indicating that the data {} has already been" - + " downloaded."; + protected static final String DOWNLOADING_MSG = "Downloading {} ..."; + protected static final String DOWNLOADING_DONE_MSG = "Downloading {} done."; + protected static final String CATEGORY_DOWNLOADING_MSG = "Downloading {}/{} ..."; + protected static final String CATEGORY_DOWNLOADING_DONE_MSG = "Downloading {}/{} done."; + protected static final String DOWNLOADING_FROM_TO_MSG = "Downloading {} to {} ..."; + protected static final String DATA_ALREADY_DOWNLOADED_MSG = "The file {} already exists, indicating that the data {} has already been" + + " downloaded."; protected String species; protected String assembly; @@ -119,16 +117,16 @@ private void init() throws CellBaseException, IOException { // Prepare outdir Path speciesFolder = outdir.resolve(speciesShortName + "_" + assemblyConfiguration.getName().toLowerCase()); downloadFolder = outdir.resolve(speciesFolder + "/download"); - logger.info("Creating download dir {}", downloadFolder); + logger.info("Creating download dir: {}", downloadFolder); Files.createDirectories(downloadFolder); downloadLogFolder = outdir.resolve(speciesFolder + "/download/log"); 
- logger.info("Creating download log dir {}", downloadLogFolder); + logger.info("Creating download log dir: {}", downloadLogFolder); Files.createDirectories(downloadLogFolder); // /_/generated_json buildFolder = outdir.resolve(speciesFolder + "/generated_json"); - logger.info("Creating build dir {}", buildFolder); + logger.info("Creating build dir: {}", buildFolder); Files.createDirectories(buildFolder); logger.info("Processing species {}", speciesConfiguration.getScientificName()); @@ -200,11 +198,8 @@ protected DownloadFile downloadDataSource(DownloadProperties.URLProperties props String chromosome, Path outPath) throws IOException, InterruptedException, CellBaseException { String url = EtlCommons.getUrl(props, fileId, species, assembly, chromosome); - File outFile = outPath.resolve(getFilenameFromUrl(url)).toFile(); - logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outFile); - DownloadFile downloadFile = downloadFile(url, outFile.toString()); - logger.info(OK_LOG_MESSAGE); - return downloadFile; + Path outFile = outPath.resolve(getFilenameFromUrl(url)); + return downloadFile(url, outFile); } protected DownloadFile downloadEnsemblDataSource(DownloadProperties.EnsemblProperties ensemblProps, String fileId, Path outPath) @@ -216,11 +211,8 @@ protected DownloadFile downloadEnsemblDataSource(DownloadProperties.EnsemblPrope Path outPath) throws IOException, InterruptedException, CellBaseException { String url = EtlCommons.getEnsemblUrl(ensemblProps, ensemblRelease, fileId, speciesShortName, assemblyConfiguration.getName(), chromosome); - File outFile = outPath.resolve(getFilenameFromUrl(url)).toFile(); - logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outFile); - DownloadFile downloadFile = downloadFile(url, outFile.toString()); - logger.info(OK_LOG_MESSAGE); - return downloadFile; + Path outFile = outPath.resolve(getFilenameFromUrl(url)); + return downloadFile(url, outFile); } protected void saveDataSource(String data, String version, String date, List urls, Path 
versionFilePath) @@ -284,71 +276,85 @@ protected String getPhylo(SpeciesConfiguration sp) { } } - protected DownloadFile downloadFile(String url, String outputFileName) throws IOException, InterruptedException, CellBaseException { - return downloadFile(url, outputFileName, null); + protected DownloadFile downloadFile(String url, Path outputFile) throws IOException, InterruptedException, CellBaseException { + return downloadFile(url, outputFile, null); } - protected DownloadFile downloadFile(String url, String outputFileName, List wgetAdditionalArgs) + protected DownloadFile downloadFile(String url, Path outputFile, List wgetAdditionalArgs) throws IOException, InterruptedException, CellBaseException { - DownloadFile downloadFileInfo = new DownloadFile(url, outputFileName, Timestamp.valueOf(LocalDateTime.now()).toString()); + DownloadFile downloadFile = new DownloadFile(url, outputFile.toAbsolutePath().toString(), + Timestamp.valueOf(LocalDateTime.now()).toString()); Long startTime = System.currentTimeMillis(); - if (Paths.get(outputFileName).toFile().exists()) { - logger.warn("File '{}' is already downloaded", outputFileName); - setDownloadStatusAndMessage(outputFileName, downloadFileInfo, "File '" + outputFileName + "' is already downloaded", true); + final Path outputLog = downloadLogFolder.resolve(outputFile.getFileName().toString() + ".log"); + if (Files.exists(outputFile)) { + logger.warn("File '{}' is already downloaded", outputFile); + setDownloadStatusAndMessage(outputFile, downloadFile, outputLog, true); + downloadFile.setMessage("File is already downloaded"); } else { - final String outputLog = downloadLogFolder + "/" + Paths.get(outputFileName).toFile().getName() + ".log"; - List wgetArgs = new ArrayList<>(Arrays.asList("--tries=10", url, "-O", outputFileName, "-o", outputLog)); + logger.info(DOWNLOADING_FROM_TO_MSG, url, outputFile); + List wgetArgs = new ArrayList<>(Arrays.asList("--tries=10", url, + "-O", outputFile.toAbsolutePath().toString(), + "-o", 
outputLog.toAbsolutePath().toString())); if (wgetAdditionalArgs != null && !wgetAdditionalArgs.isEmpty()) { wgetArgs.addAll(wgetAdditionalArgs); } boolean downloaded = EtlCommons.runCommandLineProcess(null, "wget", wgetArgs, outputLog); - setDownloadStatusAndMessage(outputFileName, downloadFileInfo, outputLog, downloaded); + setDownloadStatusAndMessage(outputFile, downloadFile, outputLog, downloaded); + logger.info(OK_MSG); } - downloadFileInfo.setElapsedTime(startTime, System.currentTimeMillis()); - return downloadFileInfo; + downloadFile.setElapsedTime(startTime, System.currentTimeMillis()); + return downloadFile; } - private void setDownloadStatusAndMessage(String outputFileName, DownloadFile downloadFile, String outputLog, boolean downloaded) { + private void setDownloadStatusAndMessage(Path outputFile, DownloadFile downloadFile, Path logFile, boolean downloaded) { if (downloaded) { - boolean validFileSize = validateDownloadFile(downloadFile, outputFileName, outputLog); + boolean validFileSize = validateDownloadFile(downloadFile, outputFile, logFile); if (validFileSize) { downloadFile.setStatus(DownloadFile.Status.OK); downloadFile.setMessage("File downloaded successfully"); } else { downloadFile.setStatus(DownloadFile.Status.ERROR); downloadFile.setMessage("Expected downloaded file size " + downloadFile.getExpectedFileSize() - + ", Actual file size " + downloadFile.getActualFileSize()); + + ", actual file size " + downloadFile.getActualFileSize()); } } else { - downloadFile.setMessage("See full error message in " + outputLog); + downloadFile.setMessage("See full error message in " + logFile); downloadFile.setStatus(DownloadFile.Status.ERROR); } } - public static void writeDownloadLogFile(Path downloadFolder, List downloadFiles) throws IOException { + public void writeDownloadLogFile(Map params, List downloadFiles) throws IOException { + // Get current date and time + String timeStamp = new SimpleDateFormat("yyyyMMdd_HHmmss").format(new Date()); + Path 
summaryPath = downloadLogFolder.resolve(timeStamp + "_summary.json"); + + Map summary = new HashMap<>(); + summary.put("params", params); + summary.put("downloadFiles", downloadFiles); + ObjectMapper mapper = new ObjectMapper(); ObjectWriter writer = mapper.writer(new DefaultPrettyPrinter()); - writer.writeValue(new File(downloadFolder + "/download_log.json"), downloadFiles); + writer.writeValue(summaryPath.toFile(), summary); } public boolean isAlreadyDownloaded(Path path, String dataName) { if (Files.exists(path)) { - logger.info(DATA_ALREADY_DOWNLOADED, path.getFileName(), dataName); + logger.info(DATA_ALREADY_DOWNLOADED_MSG, path.getFileName(), dataName); return true; } return false; } - private boolean validateDownloadFile(DownloadFile downloadFile, String outputFileName, String outputFileLog) { - long expectedFileSize = getExpectedFileSize(outputFileLog); - long actualFileSize = FileUtils.sizeOf(new File(outputFileName)); + private boolean validateDownloadFile(DownloadFile downloadFile, Path outputFile, Path logFile) { + long expectedFileSize = getExpectedFileSize(logFile); + long actualFileSize = FileUtils.sizeOf(outputFile.toFile()); downloadFile.setActualFileSize(actualFileSize); downloadFile.setExpectedFileSize(expectedFileSize); return expectedFileSize == actualFileSize; } - private long getExpectedFileSize(String outputFileLog) { - try (BufferedReader reader = new BufferedReader(new FileReader(outputFileLog))) { + private long getExpectedFileSize(Path path) { + try (BufferedReader reader = new BufferedReader(new FileReader(path.toFile()))) { String line; while ((line = reader.readLine()) != null) { // looking for: Length: 13846591 (13M) @@ -358,7 +364,7 @@ private long getExpectedFileSize(String outputFileLog) { } } } catch (Exception e) { - logger.info("Error getting expected file size {}", e.getMessage()); + logger.info("Error getting expected file size: {}. 
Stack trace: {}", e.getMessage(), Arrays.toString(e.getStackTrace())); } return -1; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java index e010bb676a..c128b1d67d 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java @@ -37,20 +37,23 @@ public CaddDownloadManager(String species, String assembly, Path targetDirectory @Override public List download() throws IOException, InterruptedException, CellBaseException { - DownloadFile downloadFile = null; + // Check if the species supports this data + if (!SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), VARIATION_FUNCTIONAL_SCORE_DATA)) { + logger.info(DATA_NOT_SUPPORTED_MSG, getDataName(VARIATION_FUNCTIONAL_SCORE_DATA), speciesConfiguration.getScientificName()); + return Collections.emptyList(); + } - if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), VARIATION_FUNCTIONAL_SCORE_DATA)) { - logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataCategory(CADD_DATA), getDataName(CADD_DATA)); + logger.info(CATEGORY_DOWNLOADING_MSG, getDataCategory(CADD_DATA), getDataName(CADD_DATA)); - // Create the CADD download path - Path caddDownloadPath = downloadFolder.resolve(VARIATION_FUNCTIONAL_SCORE_DATA).resolve(CADD_DATA); - Files.createDirectories(caddDownloadPath); + // Create the CADD download path + Path caddDownloadPath = downloadFolder.resolve(VARIATION_FUNCTIONAL_SCORE_DATA).resolve(CADD_DATA); + Files.createDirectories(caddDownloadPath); - // Download CADD and save data source - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getCadd(), CADD_FILE_ID, CADD_DATA, caddDownloadPath); + // Download CADD and save data source + DownloadFile downloadFile = 
downloadAndSaveDataSource(configuration.getDownload().getCadd(), CADD_FILE_ID, CADD_DATA, + caddDownloadPath); - logger.info(CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE, getDataCategory(CADD_DATA), getDataName(CADD_DATA)); - } + logger.info(CATEGORY_DOWNLOADING_DONE_MSG, getDataCategory(CADD_DATA), getDataName(CADD_DATA)); return Collections.singletonList(downloadFile); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java index 298634c6eb..e70e3d297b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java @@ -45,55 +45,58 @@ public List download() throws IOException, InterruptedException, C } public List downloadClinical() throws IOException, InterruptedException, CellBaseException { + // Check if the species supports this data + if (!SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), CLINICAL_VARIANT_DATA)) { + logger.info(DATA_NOT_SUPPORTED_MSG, getDataName(CLINICAL_VARIANT_DATA), speciesConfiguration.getScientificName()); + return Collections.emptyList(); + } + + DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); - // Check if the species has the data to download - if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), CLINICAL_VARIANT_DATA)) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CLINICAL_VARIANT_DATA)); - - // Create clinical directory - Path clinicalPath = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANT_DATA).toAbsolutePath(); - Files.createDirectories(clinicalPath); - - DownloadFile downloadFile; - - // ClinVar - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CLINVAR_DATA)); - DownloadProperties.URLProperties props = configuration.getDownload().getClinvar(); - List urls = new ArrayList<>(); - for 
(String fileId : Arrays.asList(CLINVAR_FULL_RELEASE_FILE_ID, CLINVAR_SUMMARY_FILE_ID, CLINVAR_ALLELE_FILE_ID, - CLINVAR_EFO_TERMS_FILE_ID)) { - downloadFile = downloadDataSource(props, fileId, clinicalPath); - downloadFiles.add(downloadFile); - - // Save URLs to be written in the version file - urls.add(downloadFile.getUrl()); - } - // Save data source - saveDataSource(CLINVAR_DATA, props.getVersion(), getTimeStamp(), urls, - clinicalPath.resolve(getDataVersionFilename(CLINVAR_DATA))); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(CLINVAR_DATA)); - - // COSMIC - logger.warn("{} files must be downloaded manually !", getDataName(COSMIC_DATA)); - props = configuration.getDownload().getCosmic(); - String url = props.getHost() + props.getFiles().get(COSMIC_FILE_ID); - saveDataSource(COSMIC_DATA, props.getVersion(), getTimeStamp(), Collections.singletonList(url), - clinicalPath.resolve(getDataVersionFilename(COSMIC_DATA))); - - // HGMD - logger.warn("{} files must be downloaded manually !", getDataName(HGMD_DATA)); - props = configuration.getDownload().getHgmd(); - url = props.getHost() + props.getFiles().get(HGMD_FILE_ID); - saveDataSource(HGMD_DATA, props.getVersion(), getTimeStamp(), Collections.singletonList(url), - clinicalPath.resolve(getDataVersionFilename(HGMD_DATA))); - - // GWAS catalog - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GWAS_DATA)); - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGwasCatalog(), GWAS_FILE_ID, GWAS_DATA, clinicalPath); + logger.info(DOWNLOADING_MSG, getDataName(CLINICAL_VARIANT_DATA)); + + // Create clinical directory + Path clinicalPath = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANT_DATA).toAbsolutePath(); + Files.createDirectories(clinicalPath); + + + // ClinVar + logger.info(DOWNLOADING_MSG, getDataName(CLINVAR_DATA)); + DownloadProperties.URLProperties props = configuration.getDownload().getClinvar(); + List urls = new ArrayList<>(); + for (String fileId : 
Arrays.asList(CLINVAR_FULL_RELEASE_FILE_ID, CLINVAR_SUMMARY_FILE_ID, CLINVAR_ALLELE_FILE_ID, + CLINVAR_EFO_TERMS_FILE_ID)) { + downloadFile = downloadDataSource(props, fileId, clinicalPath); downloadFiles.add(downloadFile); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GWAS_DATA)); + + // Save URLs to be written in the version file + urls.add(downloadFile.getUrl()); } + // Save data source + saveDataSource(CLINVAR_DATA, props.getVersion(), getTimeStamp(), urls, + clinicalPath.resolve(getDataVersionFilename(CLINVAR_DATA))); + logger.info(DOWNLOADING_DONE_MSG, getDataName(CLINVAR_DATA)); + + // COSMIC + logger.warn("{} files must be downloaded manually !", getDataName(COSMIC_DATA)); + props = configuration.getDownload().getCosmic(); + String url = props.getHost() + props.getFiles().get(COSMIC_FILE_ID); + saveDataSource(COSMIC_DATA, props.getVersion(), getTimeStamp(), Collections.singletonList(url), + clinicalPath.resolve(getDataVersionFilename(COSMIC_DATA))); + + // HGMD + logger.warn("{} files must be downloaded manually !", getDataName(HGMD_DATA)); + props = configuration.getDownload().getHgmd(); + url = props.getHost() + props.getFiles().get(HGMD_FILE_ID); + saveDataSource(HGMD_DATA, props.getVersion(), getTimeStamp(), Collections.singletonList(url), + clinicalPath.resolve(getDataVersionFilename(HGMD_DATA))); + + // GWAS catalog + logger.info(DOWNLOADING_MSG, getDataName(GWAS_DATA)); + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGwasCatalog(), GWAS_FILE_ID, GWAS_DATA, clinicalPath); + downloadFiles.add(downloadFile); + logger.info(DOWNLOADING_DONE_MSG, getDataName(GWAS_DATA)); return downloadFiles; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ConservationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ConservationDownloadManager.java index 7537e703f6..41725c0f54 100644 --- 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ConservationDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ConservationDownloadManager.java @@ -50,148 +50,121 @@ public List download() throws IOException, InterruptedException, C * @throws CellBaseException if there is an error executing the command line */ public List downloadConservation() throws IOException, InterruptedException, CellBaseException { - List downloadFiles = new ArrayList<>(); + // Check if the species supports this data + if (!SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), CONSERVATION_DATA)) { + logger.info(DATA_NOT_SUPPORTED_MSG, getDataName(CONSERVATION_DATA), speciesConfiguration.getScientificName()); + return Collections.emptyList(); + } - // Check if the species is supported - if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), CONSERVATION_DATA)) { - - // Create folders - Path conservationFolder = downloadFolder.resolve(CONSERVATION_DATA); - Files.createDirectories(conservationFolder); - Path gerpFolder = Files.createDirectories(conservationFolder.resolve(GERP_DATA)); - Path phastConsFolder = Files.createDirectories(conservationFolder.resolve(PHASTCONS_DATA)); - Path phyloPFolder = Files.createDirectories(conservationFolder.resolve(PHYLOP_DATA)); - - // Already downloaded ? 
- boolean downloadGerp = !isAlreadyDownloaded(gerpFolder.resolve(getDataVersionFilename(GERP_DATA)), getDataName(GERP_DATA)); - boolean downloadPhastCons = !isAlreadyDownloaded(phastConsFolder.resolve(getDataVersionFilename(PHASTCONS_DATA)), - getDataName(PHASTCONS_DATA)); - boolean downloadPhyloP = !isAlreadyDownloaded(phyloPFolder.resolve(getDataVersionFilename(PHYLOP_DATA)), - getDataName(PHYLOP_DATA)); - - if (!downloadGerp && !downloadPhastCons && !downloadPhyloP) { - return new ArrayList<>(); - } + List downloadFiles = new ArrayList<>(); - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); - - // Download data - String filename; - Path outputPath; - - // Prepare variables - String phastconsHost = configuration.getDownload().getPhastCons().getHost(); - String phylopHost = configuration.getDownload().getPhylop().getHost(); - List phastconsUrls = new ArrayList<>(50); - List phyloPUrls = new ArrayList<>(50); - String gerpUrl = null; - - // Human - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS)) { - // 1. 
PhastCons and PhyloP - String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", - "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "M"}; - for (String chromosome : chromosomes) { - if (downloadPhastCons) { - logger.info(DOWNLOADING_LOG_MESSAGE, getChromDownloadMessage(getDataName(PHASTCONS_DATA), chromosome)); - String phastConsUrl = phastconsHost + configuration.getDownload().getPhastCons().getFiles().get(PHASTCONS_FILE_ID) - + "chr" + chromosome + ".phastCons470way.wigFix.gz"; - filename = Paths.get(phastConsUrl).getFileName().toString(); - outputPath = conservationFolder.resolve(PHASTCONS_DATA).resolve(filename); - downloadFiles.add(downloadFile(phastConsUrl, outputPath.toString())); - phastconsUrls.add(phastConsUrl); - logger.info(OK_LOG_MESSAGE); - } - - if (downloadPhyloP) { - logger.info(DOWNLOADING_LOG_MESSAGE, getChromDownloadMessage(getDataName(PHYLOP_DATA), chromosome)); - String phyloPUrl = phylopHost + configuration.getDownload().getPhylop().getFiles().get(PHYLOP_FILE_ID) - + "chr" + chromosome + ".phyloP470way.wigFix.gz"; - filename = Paths.get(phyloPUrl).getFileName().toString(); - outputPath = conservationFolder.resolve(PHYLOP_DATA).resolve(filename); - downloadFiles.add(downloadFile(phyloPUrl, outputPath.toString())); - phyloPUrls.add(phyloPUrl); - logger.info(OK_LOG_MESSAGE); - } - } - - // 2. 
Gerp - if (downloadGerp) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GERP_DATA)); - gerpUrl = configuration.getDownload().getGerp().getHost() - + configuration.getDownload().getGerp().getFiles().get(GERP_FILE_ID); - filename = Paths.get(gerpUrl).getFileName().toString(); - outputPath = conservationFolder.resolve(GERP_DATA).resolve(filename); - downloadFiles.add(downloadFile(gerpUrl, outputPath.toString())); - logger.info(OK_LOG_MESSAGE); - } + // Create folders + Path conservationFolder = downloadFolder.resolve(CONSERVATION_DATA); + Files.createDirectories(conservationFolder); + Path gerpFolder = Files.createDirectories(conservationFolder.resolve(GERP_DATA)); + Path phastConsFolder = Files.createDirectories(conservationFolder.resolve(PHASTCONS_DATA)); + Path phyloPFolder = Files.createDirectories(conservationFolder.resolve(PHYLOP_DATA)); + + logger.info(DOWNLOADING_MSG, getDataName(CONSERVATION_DATA)); + + // Download data + String filename; + Path outputPath; + + // Prepare variables + String phastconsHost = configuration.getDownload().getPhastCons().getHost(); + String phylopHost = configuration.getDownload().getPhylop().getHost(); + List phastconsUrls = new ArrayList<>(50); + List phyloPUrls = new ArrayList<>(50); + String gerpUrl = null; + + // Human + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS)) { + // 1. 
PhastCons and PhyloP + String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", + "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "M"}; + for (String chromosome : chromosomes) { + logger.info(DOWNLOADING_MSG, getChromDownloadMessage(getDataName(PHASTCONS_DATA), chromosome)); + String phastConsUrl = phastconsHost + configuration.getDownload().getPhastCons().getFiles().get(PHASTCONS_FILE_ID) + + "chr" + chromosome + ".phastCons470way.wigFix.gz"; + filename = Paths.get(phastConsUrl).getFileName().toString(); + outputPath = conservationFolder.resolve(PHASTCONS_DATA).resolve(filename); + downloadFiles.add(downloadFile(phastConsUrl, outputPath)); + phastconsUrls.add(phastConsUrl); + logger.info(OK_MSG); + + logger.info(DOWNLOADING_MSG, getChromDownloadMessage(getDataName(PHYLOP_DATA), chromosome)); + String phyloPUrl = phylopHost + configuration.getDownload().getPhylop().getFiles().get(PHYLOP_FILE_ID) + + "chr" + chromosome + ".phyloP470way.wigFix.gz"; + filename = Paths.get(phyloPUrl).getFileName().toString(); + outputPath = conservationFolder.resolve(PHYLOP_DATA).resolve(filename); + downloadFiles.add(downloadFile(phyloPUrl, outputPath)); + phyloPUrls.add(phyloPUrl); + logger.info(OK_MSG); } - // Mouse - if (speciesConfiguration.getScientificName().equals(MUS_MUSCULUS)) { - String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); - - // 1. 
PhastCons and PhyloP - String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", - "15", "16", "17", "18", "19", "X", "Y", "M"}; - for (String chromosome : chromosomes) { - if (downloadPhastCons) { - logger.info(DOWNLOADING_LOG_MESSAGE, getChromDownloadMessage(getDataName(PHASTCONS_DATA), chromosome)); - String phastConsUrl = phastconsHost - + configuration.getDownload().getPhastCons().getFiles().get(prefixId + PHASTCONS_FILE_ID) - + "chr" + chromosome + ".phastCons35way.wigFix.gz"; - filename = Paths.get(phastConsUrl).getFileName().toString(); - outputPath = conservationFolder.resolve(PHASTCONS_DATA).resolve(filename); - downloadFiles.add(downloadFile(phastConsUrl, outputPath.toString())); - phastconsUrls.add(phastConsUrl); - logger.info(OK_LOG_MESSAGE); - } - - if (downloadPhyloP) { - logger.info(DOWNLOADING_LOG_MESSAGE, getChromDownloadMessage(getDataName(PHYLOP_DATA), chromosome)); - String phyloPUrl = phylopHost + configuration.getDownload().getPhylop().getFiles().get(prefixId + PHYLOP_FILE_ID) - + "chr" + chromosome + ".phyloP35way.wigFix.gz"; - filename = Paths.get(phyloPUrl).getFileName().toString(); - outputPath = conservationFolder.resolve(PHYLOP_DATA).resolve(filename); - downloadFiles.add(downloadFile(phyloPUrl, outputPath.toString())); - phyloPUrls.add(phyloPUrl); - logger.info(OK_LOG_MESSAGE); - } - } - - // 2. Gerp - if (downloadGerp) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GERP_DATA)); - gerpUrl = configuration.getDownload().getGerp().getHost() - + configuration.getDownload().getGerp().getFiles().get(prefixId + GERP_FILE_ID); - filename = Paths.get(gerpUrl).getFileName().toString(); - outputPath = conservationFolder.resolve(GERP_DATA).resolve(filename); - downloadFiles.add(downloadFile(gerpUrl, outputPath.toString())); - logger.info(OK_LOG_MESSAGE); - } - } + // 2. 
Gerp + logger.info(DOWNLOADING_MSG, getDataName(GERP_DATA)); + gerpUrl = configuration.getDownload().getGerp().getHost() + + configuration.getDownload().getGerp().getFiles().get(GERP_FILE_ID); + filename = Paths.get(gerpUrl).getFileName().toString(); + outputPath = conservationFolder.resolve(GERP_DATA).resolve(filename); + downloadFiles.add(downloadFile(gerpUrl, outputPath)); + logger.info(OK_MSG); + } - // Save data version - if (downloadPhastCons) { - saveDataSource(PHASTCONS_DATA, configuration.getDownload().getPhastCons().getVersion(), getTimeStamp(), phastconsUrls, - phastConsFolder.resolve(getDataVersionFilename(PHASTCONS_DATA))); - } - if (downloadPhyloP) { - saveDataSource(PHYLOP_DATA, configuration.getDownload().getPhylop().getVersion(), getTimeStamp(), phyloPUrls, - phyloPFolder.resolve(getDataVersionFilename(PHYLOP_DATA))); - } - if (downloadGerp) { - saveDataSource(GERP_DATA, configuration.getDownload().getGerp().getVersion(), getTimeStamp(), - Collections.singletonList(gerpUrl), gerpFolder.resolve(getDataVersionFilename(GERP_DATA))); + // Mouse + if (speciesConfiguration.getScientificName().equals(MUS_MUSCULUS)) { + String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); + + // 1. 
PhastCons and PhyloP + String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", + "15", "16", "17", "18", "19", "X", "Y", "M"}; + for (String chromosome : chromosomes) { + logger.info(DOWNLOADING_MSG, getChromDownloadMessage(getDataName(PHASTCONS_DATA), chromosome)); + String phastConsUrl = phastconsHost + + configuration.getDownload().getPhastCons().getFiles().get(prefixId + PHASTCONS_FILE_ID) + + "chr" + chromosome + ".phastCons35way.wigFix.gz"; + filename = Paths.get(phastConsUrl).getFileName().toString(); + outputPath = conservationFolder.resolve(PHASTCONS_DATA).resolve(filename); + downloadFiles.add(downloadFile(phastConsUrl, outputPath)); + phastconsUrls.add(phastConsUrl); + logger.info(OK_MSG); + + logger.info(DOWNLOADING_MSG, getChromDownloadMessage(getDataName(PHYLOP_DATA), chromosome)); + String phyloPUrl = phylopHost + configuration.getDownload().getPhylop().getFiles().get(prefixId + PHYLOP_FILE_ID) + + "chr" + chromosome + ".phyloP35way.wigFix.gz"; + filename = Paths.get(phyloPUrl).getFileName().toString(); + outputPath = conservationFolder.resolve(PHYLOP_DATA).resolve(filename); + downloadFiles.add(downloadFile(phyloPUrl, outputPath)); + phyloPUrls.add(phyloPUrl); + logger.info(OK_MSG); } - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); + // 2. 
Gerp + logger.info(DOWNLOADING_MSG, getDataName(GERP_DATA)); + gerpUrl = configuration.getDownload().getGerp().getHost() + + configuration.getDownload().getGerp().getFiles().get(prefixId + GERP_FILE_ID); + filename = Paths.get(gerpUrl).getFileName().toString(); + outputPath = conservationFolder.resolve(GERP_DATA).resolve(filename); + downloadFiles.add(downloadFile(gerpUrl, outputPath)); + logger.info(OK_MSG); } + + // Save data version + saveDataSource(PHASTCONS_DATA, configuration.getDownload().getPhastCons().getVersion(), getTimeStamp(), phastconsUrls, + phastConsFolder.resolve(getDataVersionFilename(PHASTCONS_DATA))); + saveDataSource(PHYLOP_DATA, configuration.getDownload().getPhylop().getVersion(), getTimeStamp(), phyloPUrls, + phyloPFolder.resolve(getDataVersionFilename(PHYLOP_DATA))); + saveDataSource(GERP_DATA, configuration.getDownload().getGerp().getVersion(), getTimeStamp(), + Collections.singletonList(gerpUrl), gerpFolder.resolve(getDataVersionFilename(GERP_DATA))); + + logger.info(DOWNLOADING_DONE_MSG, getDataName(CONSERVATION_DATA)); + return downloadFiles; } private String getChromDownloadMessage(String dataName, String chromosome) { return dataName + ", chrom. 
" + chromosome; } - } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index 6bb219fb2e..50163b58a1 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -42,7 +42,7 @@ public GeneDownloadManager(String species, String assembly, Path targetDirectory @Override public List download() throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_DATA)); + logger.info(DOWNLOADING_MSG, getDataName(GENE_DATA)); // Create gene folder Path geneDownloadPath = downloadFolder.resolve(GENE_DATA); @@ -70,7 +70,7 @@ public List download() throws IOException, InterruptedException, C downloadFiles.addAll(downloadRefSeq(refSeqDownloadPath)); // Gene annotation - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_ANNOTATION_DATA)); + logger.info(DOWNLOADING_MSG, getDataName(GENE_ANNOTATION_DATA)); downloadFiles.add(downloadMane(geneDownloadPath)); downloadFiles.add(downloadLrg(geneDownloadPath)); downloadFiles.add(downloadHgnc(geneDownloadPath)); @@ -80,7 +80,7 @@ public List download() throws IOException, InterruptedException, C downloadFiles.add(downloadGeneExpressionAtlas(geneDownloadPath)); downloadFiles.add(downloadGnomadConstraints(geneDownloadPath)); downloadFiles.add(downloadGO(geneDownloadPath)); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_ANNOTATION_DATA)); + logger.info(DOWNLOADING_DONE_MSG, getDataName(GENE_ANNOTATION_DATA)); // Save data sources manually downloaded if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS)) { @@ -107,7 +107,7 @@ public List download() throws IOException, InterruptedException, C } } - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_DATA)); + logger.info(DOWNLOADING_DONE_MSG, 
getDataName(GENE_DATA)); return downloadFiles; } @@ -116,12 +116,7 @@ private List downloadEnsemblData(Path ensemblDownloadPath) throws // Check if the species is supported if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), GENE_DATA)) { - // Already downloaded ? - if (isAlreadyDownloaded(ensemblDownloadPath.resolve(getDataVersionFilename(ENSEMBL_DATA)), getDataName(ENSEMBL_DATA))) { - return downloadFiles; - } - - logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataName(ENSEMBL_DATA), getDataCategory(ENSEMBL_DATA)); + logger.info(CATEGORY_DOWNLOADING_MSG, getDataName(ENSEMBL_DATA), getDataCategory(ENSEMBL_DATA)); DownloadProperties.EnsemblProperties ensemblConfig = configuration.getDownload().getEnsembl(); // GTF, DNA, RNA @@ -140,7 +135,7 @@ private List downloadEnsemblData(Path ensemblDownloadPath) throws saveDataSource(ENSEMBL_DATA, ensemblVersion, getTimeStamp(), urls, ensemblDownloadPath.resolve(getDataVersionFilename(ENSEMBL_DATA))); - logger.info(CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE, getDataName(ENSEMBL_DATA), getDataCategory(ENSEMBL_DATA)); + logger.info(CATEGORY_DOWNLOADING_DONE_MSG, getDataName(ENSEMBL_DATA), getDataCategory(ENSEMBL_DATA)); } return downloadFiles; } @@ -152,9 +147,8 @@ private List downloadRefSeq(Path refSeqDownloadPath) throws IOExce if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), GENE_DATA)) { // GTF, DNA, RNA, Protein String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); - if (configuration.getDownload().getRefSeq().getFiles().containsKey(prefixId + REFSEQ_GENOMIC_GTF_FILE_ID) - && !isAlreadyDownloaded(refSeqDownloadPath.resolve(getDataVersionFilename(REFSEQ_DATA)), getDataName(REFSEQ_DATA))) { - logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataName(REFSEQ_DATA), getDataCategory(REFSEQ_DATA)); + if (configuration.getDownload().getRefSeq().getFiles().containsKey(prefixId + REFSEQ_GENOMIC_GTF_FILE_ID)) { + 
logger.info(CATEGORY_DOWNLOADING_MSG, getDataName(REFSEQ_DATA), getDataCategory(REFSEQ_DATA)); DownloadProperties.URLProperties refSeqConfig = configuration.getDownload().getRefSeq(); downloadFiles.add(downloadDataSource(refSeqConfig, prefixId + REFSEQ_GENOMIC_GTF_FILE_ID, refSeqDownloadPath)); @@ -166,7 +160,7 @@ private List downloadRefSeq(Path refSeqDownloadPath) throws IOExce saveDataSource(REFSEQ_DATA, refSeqConfig.getVersion(), getTimeStamp(), getUrls(downloadFiles), refSeqDownloadPath.resolve(getDataVersionFilename(REFSEQ_DATA))); - logger.info(CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE, getDataName(REFSEQ_DATA), getDataCategory(REFSEQ_DATA)); + logger.info(CATEGORY_DOWNLOADING_DONE_MSG, getDataName(REFSEQ_DATA), getDataCategory(REFSEQ_DATA)); } } return downloadFiles; @@ -181,7 +175,7 @@ public void downloadEnsemblCanonical(Path geneDownloadPath) throws IOException, return; } - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(ENSEMBL_CANONICAL_DATA)); + logger.info(DOWNLOADING_MSG, getDataName(ENSEMBL_CANONICAL_DATA)); String dockerImage = "opencb/cellbase-builder:" + GitRepositoryState.get().getBuildVersion(); @@ -202,7 +196,7 @@ public void downloadEnsemblCanonical(Path geneDownloadPath) throws IOException, logger.error("Error executing script {}: {}", params, e.getStackTrace()); } - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(ENSEMBL_CANONICAL_DATA)); + logger.info(DOWNLOADING_DONE_MSG, getDataName(ENSEMBL_CANONICAL_DATA)); } public void downloadGeneExtraInfo(Path geneDownloadPath) throws IOException, CellBaseException { @@ -216,7 +210,7 @@ public void downloadGeneExtraInfo(Path geneDownloadPath) throws IOException, Cel return; } - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_EXTRA_INFO_DATA)); + logger.info(DOWNLOADING_MSG, getDataName(GENE_EXTRA_INFO_DATA)); String dockerImage = "opencb/cellbase-builder:" + GitRepositoryState.get().getBuildVersion(); @@ -238,22 +232,20 @@ public void downloadGeneExtraInfo(Path geneDownloadPath) throws 
IOException, Cel logger.error("Error executing script {}: {}", params, e.getStackTrace()); } - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_EXTRA_INFO_DATA)); + logger.info(DOWNLOADING_DONE_MSG, getDataName(GENE_EXTRA_INFO_DATA)); } private DownloadFile downloadMane(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { DownloadFile downloadFile = null; // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS) - && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(MANE_SELECT_DATA)), - getDataName(MANE_SELECT_DATA))) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(MANE_SELECT_DATA)); + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS)) { + logger.info(DOWNLOADING_MSG, getDataName(MANE_SELECT_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getManeSelect(), MANE_SELECT_FILE_ID, MANE_SELECT_DATA, geneDownloadPath); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(MANE_SELECT_DATA)); + logger.info(DOWNLOADING_DONE_MSG, getDataName(MANE_SELECT_DATA)); } return downloadFile; } @@ -263,11 +255,11 @@ private DownloadFile downloadLrg(Path geneDownloadPath) throws IOException, Inte // Check if the species is supported if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS)) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(LRG_DATA)); + logger.info(DOWNLOADING_MSG, getDataName(LRG_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getLrg(), LRG_FILE_ID, LRG_DATA, geneDownloadPath); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(LRG_DATA)); + logger.info(DOWNLOADING_DONE_MSG, getDataName(LRG_DATA)); } return downloadFile; } @@ -276,13 +268,12 @@ private DownloadFile downloadHgnc(Path geneDownloadPath) throws IOException, Int DownloadFile downloadFile = null; // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS) - 
&& !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(HGNC_DATA)), getDataName(HGNC_DATA))) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(HGNC_DATA)); + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS)) { + logger.info(DOWNLOADING_MSG, getDataName(HGNC_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getHgnc(), HGNC_FILE_ID, HGNC_DATA, geneDownloadPath); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(HGNC_DATA)); + logger.info(DOWNLOADING_DONE_MSG, getDataName(HGNC_DATA)); } return downloadFile; } @@ -291,15 +282,13 @@ private DownloadFile downloadCancerHotspot(Path geneDownloadPath) throws IOExcep DownloadFile downloadFile = null; // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS) - && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(CANCER_HOTSPOT_DATA)), - getDataName(CANCER_HOTSPOT_DATA))) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CANCER_HOTSPOT_DATA)); + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS)) { + logger.info(DOWNLOADING_MSG, getDataName(CANCER_HOTSPOT_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getCancerHotspot(), CANCER_HOTSPOT_FILE_ID, CANCER_HOTSPOT_DATA, geneDownloadPath); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(CANCER_HOTSPOT_DATA)); + logger.info(DOWNLOADING_DONE_MSG, getDataName(CANCER_HOTSPOT_DATA)); } return downloadFile; } @@ -308,13 +297,12 @@ private DownloadFile downloadDrugData(Path geneDownloadPath) throws IOException, DownloadFile downloadFile = null; // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS) - && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(DGIDB_DATA)), getDataName(DGIDB_DATA))) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(DGIDB_DATA)); + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS)) { + 
logger.info(DOWNLOADING_MSG, getDataName(DGIDB_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDgidb(), DGIDB_FILE_ID, DGIDB_DATA, geneDownloadPath); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(DGIDB_DATA)); + logger.info(DOWNLOADING_DONE_MSG, getDataName(DGIDB_DATA)); } return downloadFile; } @@ -324,15 +312,13 @@ private DownloadFile downloadGeneUniprotXref(Path geneDownloadPath) throws IOExc // Check if the species is supported String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); - if (configuration.getDownload().getGeneUniprotXref().getFiles().containsKey(prefixId + UNIPROT_XREF_FILE_ID) - && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(UNIPROT_XREF_DATA)), - getDataName(UNIPROT_XREF_DATA))) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(UNIPROT_XREF_DATA)); + if (configuration.getDownload().getGeneUniprotXref().getFiles().containsKey(prefixId + UNIPROT_XREF_FILE_ID)) { + logger.info(DOWNLOADING_MSG, getDataName(UNIPROT_XREF_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGeneUniprotXref(), prefixId + UNIPROT_XREF_FILE_ID, UNIPROT_XREF_DATA, geneDownloadPath); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(UNIPROT_XREF_DATA)); + logger.info(DOWNLOADING_DONE_MSG, getDataName(UNIPROT_XREF_DATA)); } return downloadFile; } @@ -341,15 +327,13 @@ private DownloadFile downloadGeneExpressionAtlas(Path geneDownloadPath) throws I DownloadFile downloadFile = null; // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS) - && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(GENE_EXPRESSION_ATLAS_DATA)), - getDataName(GENE_EXPRESSION_ATLAS_DATA))) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_EXPRESSION_ATLAS_DATA)); + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS)) { + logger.info(DOWNLOADING_MSG, 
getDataName(GENE_EXPRESSION_ATLAS_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGeneExpressionAtlas(), GENE_EXPRESSION_ATLAS_FILE_ID, GENE_EXPRESSION_ATLAS_DATA, geneDownloadPath); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_EXPRESSION_ATLAS_DATA)); + logger.info(DOWNLOADING_DONE_MSG, getDataName(GENE_EXPRESSION_ATLAS_DATA)); } return downloadFile; } @@ -358,15 +342,13 @@ private DownloadFile downloadGnomadConstraints(Path geneDownloadPath) throws IOE DownloadFile downloadFile = null; // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS) - && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(GNOMAD_CONSTRAINTS_DATA)), - getDataName(GNOMAD_CONSTRAINTS_DATA))) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GNOMAD_CONSTRAINTS_DATA)); + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS)) { + logger.info(DOWNLOADING_MSG, getDataName(GNOMAD_CONSTRAINTS_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGnomadConstraints(), GNOMAD_CONSTRAINTS_FILE_ID, GNOMAD_CONSTRAINTS_DATA, geneDownloadPath); - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GNOMAD_CONSTRAINTS_DATA)); + logger.info(DOWNLOADING_MSG, getDataName(GNOMAD_CONSTRAINTS_DATA)); } return downloadFile; } @@ -376,15 +358,13 @@ private DownloadFile downloadGO(Path geneDownloadPath) throws IOException, Inter // Check if the species is supported String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); - if (configuration.getDownload().getGoAnnotation().getFiles().containsKey(prefixId + GO_ANNOTATION_FILE_ID) - && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(GO_ANNOTATION_DATA)), - getDataName(GO_ANNOTATION_DATA))) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GO_ANNOTATION_DATA)); + if (configuration.getDownload().getGoAnnotation().getFiles().containsKey(prefixId + 
GO_ANNOTATION_FILE_ID)) { + logger.info(DOWNLOADING_MSG, getDataName(GO_ANNOTATION_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGoAnnotation(), prefixId + GO_ANNOTATION_FILE_ID, GO_ANNOTATION_DATA, geneDownloadPath); - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GO_ANNOTATION_DATA)); + logger.info(DOWNLOADING_MSG, getDataName(GO_ANNOTATION_DATA)); } return downloadFile; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index 30cd8248ae..417e34831c 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -53,12 +53,7 @@ public List download() throws IOException, InterruptedException, C public List downloadReferenceGenome() throws IOException, InterruptedException, CellBaseException { Path genomeVersionFilePath = sequenceFolder.resolve(getDataVersionFilename(GENOME_DATA)); - // Already downloaded - if (isAlreadyDownloaded(genomeVersionFilePath, getDataName(GENOME_DATA))) { - return new ArrayList<>(); - } - - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENOME_DATA)); + logger.info(DOWNLOADING_MSG, getDataName(GENOME_DATA)); Files.createDirectories(sequenceFolder); List urls = new ArrayList<>(); @@ -75,7 +70,7 @@ public List downloadReferenceGenome() throws IOException, Interrup // Save data source saveDataSource(GENOME_DATA, ensemblVersion, getTimeStamp(), urls, genomeVersionFilePath); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENOME_DATA)); + logger.info(DOWNLOADING_DONE_MSG, getDataName(GENOME_DATA)); return Collections.singletonList(downloadFile); } @@ -86,7 +81,7 @@ public void downloadGenomeInfo() throws IOException, CellBaseException { return; } - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENOME_INFO_DATA)); + 
logger.info(DOWNLOADING_MSG, getDataName(GENOME_INFO_DATA)); Files.createDirectories(sequenceFolder); String dockerImage = "opencb/cellbase-builder:" + GitRepositoryState.get().getBuildVersion(); @@ -108,6 +103,6 @@ public void downloadGenomeInfo() throws IOException, CellBaseException { throw new CellBaseException("Error executing Perl script from Docker " + dockerImage, e); } - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENOME_INFO_DATA)); + logger.info(DOWNLOADING_DONE_MSG, getDataName(GENOME_INFO_DATA)); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java index efb94227d4..58461018b4 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java @@ -37,14 +37,17 @@ public MissenseScoresDownloadManager(String species, String assembly, Path targe @Override public List download() throws IOException, InterruptedException, CellBaseException { - DownloadFile downloadFile = null; - if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), MISSENSE_VARIATION_SCORE_DATA)) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(MISSENSE_VARIATION_SCORE_DATA)); + // Check if the species supports this data + if (!SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), MISSENSE_VARIATION_SCORE_DATA)) { + logger.info(DATA_NOT_SUPPORTED_MSG, getDataName(MISSENSE_VARIATION_SCORE_DATA), speciesConfiguration.getScientificName()); + return Collections.emptyList(); + } - downloadFile = downloadRevel(); + logger.info(DOWNLOADING_MSG, getDataName(MISSENSE_VARIATION_SCORE_DATA)); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(MISSENSE_VARIATION_SCORE_DATA)); - } + DownloadFile downloadFile = downloadRevel(); + + 
logger.info(DOWNLOADING_DONE_MSG, getDataName(MISSENSE_VARIATION_SCORE_DATA)); return Collections.singletonList(downloadFile); } @@ -56,7 +59,7 @@ public DownloadFile downloadRevel() throws IOException, InterruptedException, Ce // Check if the species is supported if (configuration.getDownload().getRevel().getFiles().containsKey(prefixId + REVEL_FILE_ID)) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(REVEL_DATA)); + logger.info(DOWNLOADING_MSG, getDataName(REVEL_DATA)); // Create the REVEL download path Path revelDownloadPath = downloadFolder.resolve(MISSENSE_VARIATION_SCORE_DATA).resolve(REVEL_DATA); @@ -66,7 +69,7 @@ public DownloadFile downloadRevel() throws IOException, InterruptedException, Ce downloadFile = downloadAndSaveDataSource(configuration.getDownload().getRevel(), prefixId + REVEL_FILE_ID, REVEL_DATA, revelDownloadPath); - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(REVEL_DATA)); + logger.info(DOWNLOADING_MSG, getDataName(REVEL_DATA)); } return downloadFile; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java index cabfd2339b..5e262d6796 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java @@ -42,58 +42,60 @@ public OntologyDownloadManager(String species, String assembly, Path targetDirec } public List download() throws IOException, InterruptedException, CellBaseException { + // Check if the species supports this data + if (!SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), ONTOLOGY_DATA)) { + logger.info(DATA_NOT_SUPPORTED_MSG, getDataName(ONTOLOGY_DATA), speciesConfiguration.getScientificName()); + return Collections.emptyList(); + } + + logger.info(DOWNLOADING_MSG, getDataName(ONTOLOGY_DATA)); + + Path oboFolder = 
downloadFolder.resolve(ONTOLOGY_DATA); + Files.createDirectories(oboFolder); + + String version; + DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); - // Check if the species has the data to download - if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), ONTOLOGY_DATA)) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(ONTOLOGY_DATA)); - - Path oboFolder = downloadFolder.resolve(ONTOLOGY_DATA); - Files.createDirectories(oboFolder); - - String version; - DownloadFile downloadFile; - - if (speciesConfiguration.getScientificName().equalsIgnoreCase(HOMO_SAPIENS)) { - // HPO - Files.createDirectories(oboFolder.resolve(HPO_OBO_DATA)); - downloadFile = downloadDataSource(configuration.getDownload().getHpoObo(), HPO_OBO_FILE_ID, - oboFolder.resolve(HPO_OBO_DATA)); - version = getVersionFromOboFile(oboFolder.resolve(HPO_OBO_DATA).resolve(downloadFile.getOutputFile())); - saveDataSource(HPO_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - oboFolder.resolve(HPO_OBO_DATA).resolve(getDataVersionFilename(HPO_OBO_DATA))); - downloadFiles.add(downloadFile); - - // DOID - Files.createDirectories(oboFolder.resolve(DOID_OBO_DATA)); - downloadFile = downloadDataSource(configuration.getDownload().getDoidObo(), DOID_OBO_FILE_ID, - oboFolder.resolve(DOID_OBO_DATA)); - version = getVersionFromOboFile(oboFolder.resolve(DOID_OBO_DATA).resolve(downloadFile.getOutputFile())); - saveDataSource(DOID_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - oboFolder.resolve(DOID_OBO_DATA).resolve(getDataVersionFilename(DOID_OBO_DATA))); - downloadFiles.add(downloadFile); - - // Mondo - Files.createDirectories(oboFolder.resolve(MONDO_OBO_DATA)); - downloadFile = downloadDataSource(configuration.getDownload().getMondoObo(), MONDO_OBO_FILE_ID, - oboFolder.resolve(MONDO_OBO_DATA)); - version = 
getVersionFromOboFile(oboFolder.resolve(MONDO_OBO_DATA).resolve(downloadFile.getOutputFile())); - saveDataSource(MONDO_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - oboFolder.resolve(MONDO_OBO_DATA).resolve(getDataVersionFilename(MONDO_OBO_DATA))); - downloadFiles.add(downloadFile); - } + if (speciesConfiguration.getScientificName().equalsIgnoreCase(HOMO_SAPIENS)) { + // HPO + Files.createDirectories(oboFolder.resolve(HPO_OBO_DATA)); + downloadFile = downloadDataSource(configuration.getDownload().getHpoObo(), HPO_OBO_FILE_ID, + oboFolder.resolve(HPO_OBO_DATA)); + version = getVersionFromOboFile(oboFolder.resolve(HPO_OBO_DATA).resolve(downloadFile.getOutputFile())); + saveDataSource(HPO_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(HPO_OBO_DATA).resolve(getDataVersionFilename(HPO_OBO_DATA))); + downloadFiles.add(downloadFile); - // GO - Files.createDirectories(oboFolder.resolve(GO_OBO_DATA)); - downloadFile = downloadDataSource(configuration.getDownload().getGoObo(), GO_OBO_FILE_ID, oboFolder.resolve(GO_OBO_DATA)); - version = getVersionFromOboFile(oboFolder.resolve(GO_OBO_DATA).resolve(downloadFile.getOutputFile())); - saveDataSource(GO_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - oboFolder.resolve(GO_OBO_DATA).resolve(getDataVersionFilename(GO_OBO_DATA))); + // DOID + Files.createDirectories(oboFolder.resolve(DOID_OBO_DATA)); + downloadFile = downloadDataSource(configuration.getDownload().getDoidObo(), DOID_OBO_FILE_ID, + oboFolder.resolve(DOID_OBO_DATA)); + version = getVersionFromOboFile(oboFolder.resolve(DOID_OBO_DATA).resolve(downloadFile.getOutputFile())); + saveDataSource(DOID_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(DOID_OBO_DATA).resolve(getDataVersionFilename(DOID_OBO_DATA))); downloadFiles.add(downloadFile); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, 
getDataName(ONTOLOGY_DATA)); + // Mondo + Files.createDirectories(oboFolder.resolve(MONDO_OBO_DATA)); + downloadFile = downloadDataSource(configuration.getDownload().getMondoObo(), MONDO_OBO_FILE_ID, + oboFolder.resolve(MONDO_OBO_DATA)); + version = getVersionFromOboFile(oboFolder.resolve(MONDO_OBO_DATA).resolve(downloadFile.getOutputFile())); + saveDataSource(MONDO_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(MONDO_OBO_DATA).resolve(getDataVersionFilename(MONDO_OBO_DATA))); + downloadFiles.add(downloadFile); } + // GO + Files.createDirectories(oboFolder.resolve(GO_OBO_DATA)); + downloadFile = downloadDataSource(configuration.getDownload().getGoObo(), GO_OBO_FILE_ID, oboFolder.resolve(GO_OBO_DATA)); + version = getVersionFromOboFile(oboFolder.resolve(GO_OBO_DATA).resolve(downloadFile.getOutputFile())); + saveDataSource(GO_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(GO_OBO_DATA).resolve(getDataVersionFilename(GO_OBO_DATA))); + downloadFiles.add(downloadFile); + + logger.info(DOWNLOADING_DONE_MSG, getDataName(ONTOLOGY_DATA)); + return downloadFiles; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java index 649c580493..5ffbf9ed93 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java @@ -25,6 +25,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -38,36 +39,40 @@ public PharmGKBDownloadManager(String species, String assembly, Path targetDirec @Override public List download() throws IOException, InterruptedException, 
CellBaseException { - List downloadFiles = new ArrayList<>(); - - // Check if the species has the data to download - if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), PHARMGKB_DATA)) { - logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataCategory(PHARMGKB_DATA), getDataName(PHARMGKB_DATA)); + // Check if the species supports this data + if (!SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), PHARMACOGENOMICS_DATA)) { + logger.info(DATA_NOT_SUPPORTED_MSG, getDataName(PHARMACOGENOMICS_DATA), speciesConfiguration.getScientificName()); + return Collections.emptyList(); + } - Path pharmgkbDownloadFolder = downloadFolder.resolve(PHARMACOGENOMICS_DATA).resolve(PHARMGKB_DATA); - Files.createDirectories(pharmgkbDownloadFolder); + logger.info(CATEGORY_DOWNLOADING_MSG, getDataCategory(PHARMGKB_DATA), getDataName(PHARMGKB_DATA)); - DownloadProperties.URLProperties pharmGKBConfig = configuration.getDownload().getPharmGKB(); + Path pharmgkbDownloadFolder = downloadFolder.resolve(PHARMACOGENOMICS_DATA).resolve(PHARMGKB_DATA); + Files.createDirectories(pharmgkbDownloadFolder); - List urls = new ArrayList<>(); - for (String fileName : pharmGKBConfig.getFiles().values()) { - String url = pharmGKBConfig.getHost() + fileName; - urls.add(url); + DownloadProperties.URLProperties pharmGKBConfig = configuration.getDownload().getPharmGKB(); - Path downloadedFilePath = pharmgkbDownloadFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, downloadedFilePath); - DownloadFile downloadFile = downloadFile(url, downloadedFilePath.toString()); - logger.info(OK_LOG_MESSAGE); - downloadFiles.add(downloadFile); - } + DownloadFile downloadFile; + List downloadFiles = new ArrayList<>(); - // Save data source - saveDataSource(PHARMGKB_DATA, pharmGKBConfig.getVersion(), getTimeStamp(), urls, - pharmgkbDownloadFolder.resolve(getDataVersionFilename(PHARMGKB_DATA))); + List urls = new ArrayList<>(); + for 
(String fileName : pharmGKBConfig.getFiles().values()) { + String url = pharmGKBConfig.getHost() + fileName; + urls.add(url); - logger.info(CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE, getDataCategory(PHARMGKB_DATA), getDataName(PHARMGKB_DATA)); + Path downloadedFilePath = pharmgkbDownloadFolder.resolve(getFilenameFromUrl(url)); + logger.info(DOWNLOADING_FROM_TO_MSG, url, downloadedFilePath); + downloadFile = downloadFile(url, downloadedFilePath); + logger.info(OK_MSG); + downloadFiles.add(downloadFile); } + // Save data source + saveDataSource(PHARMGKB_DATA, pharmGKBConfig.getVersion(), getTimeStamp(), urls, + pharmgkbDownloadFolder.resolve(getDataVersionFilename(PHARMGKB_DATA))); + + logger.info(CATEGORY_DOWNLOADING_DONE_MSG, getDataCategory(PHARMGKB_DATA), getDataName(PHARMGKB_DATA)); + return downloadFiles; } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java index 29719208fa..06347d12e2 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java @@ -24,6 +24,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -44,56 +45,40 @@ public ProteinDownloadManager(String species, String assembly, Path targetDirect * @throws CellBaseException if there is an error in the CelllBase configuration file */ public List download() throws IOException, InterruptedException, CellBaseException { + // Check if the species supports this data + if (!SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), PROTEIN_DATA)) { + logger.info(DATA_NOT_SUPPORTED_MSG, getDataName(PROTEIN_DATA), speciesConfiguration.getScientificName()); + return 
Collections.emptyList(); + } + + Path proteinFolder = downloadFolder.resolve(PROTEIN_DATA); + Files.createDirectories(proteinFolder); + + Path uniProtFolder = Files.createDirectories(proteinFolder.resolve(UNIPROT_DATA)); + Path interProFolder = Files.createDirectories(proteinFolder.resolve(INTERPRO_DATA)); + Path intactFolder = Files.createDirectories(proteinFolder.resolve(INTACT_DATA)); + + DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); - // Check if the species is supported - if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), PROTEIN_DATA)) { - Path proteinFolder = downloadFolder.resolve(PROTEIN_DATA); - Files.createDirectories(proteinFolder); - - Path uniProtFolder = Files.createDirectories(proteinFolder.resolve(UNIPROT_DATA)); - Path interProFolder = Files.createDirectories(proteinFolder.resolve(INTERPRO_DATA)); - Path intactFolder = Files.createDirectories(proteinFolder.resolve(INTACT_DATA)); - - // Already downloaded ? - boolean downloadUniProt = !isAlreadyDownloaded(uniProtFolder.resolve(getDataVersionFilename(UNIPROT_DATA)), - getDataName(UNIPROT_DATA)); - boolean downloadInterPro = !isAlreadyDownloaded(interProFolder.resolve(getDataVersionFilename(INTERPRO_DATA)), - getDataName(INTERPRO_DATA)); - boolean downloadIntact = !isAlreadyDownloaded(intactFolder.resolve(getDataVersionFilename(INTACT_DATA)), - getDataName(INTACT_DATA)); - - if (!downloadUniProt && !downloadInterPro && !downloadIntact) { - return new ArrayList<>(); - } - - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(PROTEIN_DATA)); - - DownloadFile downloadFile; - - // Uniprot - if (downloadUniProt) { - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getUniprot(), UNIPROT_FILE_ID, UNIPROT_DATA, - uniProtFolder); - downloadFiles.add(downloadFile); - } - - // InterPro - if (downloadInterPro) { - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getInterpro(), INTERPRO_FILE_ID, INTERPRO_DATA, - 
interProFolder); - downloadFiles.add(downloadFile); - } - - // Intact - if (downloadIntact) { - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getIntact(), INTACT_FILE_ID, INTACT_DATA, - intactFolder); - downloadFiles.add(downloadFile); - } - - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(PROTEIN_DATA)); - } + logger.info(DOWNLOADING_MSG, getDataName(PROTEIN_DATA)); + + // Uniprot + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getUniprot(), UNIPROT_FILE_ID, UNIPROT_DATA, + uniProtFolder); + downloadFiles.add(downloadFile); + + // InterPro + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getInterpro(), INTERPRO_FILE_ID, INTERPRO_DATA, + interProFolder); + downloadFiles.add(downloadFile); + + // Intact + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getIntact(), INTACT_FILE_ID, INTACT_DATA, + intactFolder); + downloadFiles.add(downloadFile); + + logger.info(DOWNLOADING_DONE_MSG, getDataName(PROTEIN_DATA)); return downloadFiles; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java index 9006be7a7d..b73a752b10 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java @@ -19,6 +19,7 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.utils.SpeciesUtils; import java.io.IOException; import java.nio.file.Files; @@ -38,7 +39,13 @@ public PubMedDownloadManager(String species, String assembly, Path targetDirecto @Override public List download() throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, 
getDataName(PUBMED_DATA)); + // Check if the species supports this data + if (!SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), PUBMED_DATA)) { + logger.info(DATA_NOT_SUPPORTED_MSG, getDataName(PUBMED_DATA), speciesConfiguration.getScientificName()); + return Collections.emptyList(); + } + + logger.info(DOWNLOADING_MSG, getDataName(PUBMED_DATA)); Path pubmedDownloadFolder = downloadFolder.resolve(PUBMED_DATA); Files.createDirectories(pubmedDownloadFolder); @@ -49,16 +56,16 @@ public List download() throws IOException, InterruptedException, C List downloadFiles = new ArrayList<>(); for (String filename : filenames) { String url = host + filename; - logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, pubmedDownloadFolder.resolve(filename)); - downloadFiles.add(downloadFile(url, pubmedDownloadFolder.resolve(filename).toString())); - logger.info(OK_LOG_MESSAGE); + logger.info(DOWNLOADING_FROM_TO_MSG, url, pubmedDownloadFolder.resolve(filename)); + downloadFiles.add(downloadFile(url, pubmedDownloadFolder.resolve(filename))); + logger.info(OK_MSG); } // Save data source saveDataSource(PUBMED_DATA, configuration.getDownload().getPubmed().getVersion(), getTimeStamp(), Collections.singletonList(host), pubmedDownloadFolder.resolve(getDataVersionFilename(PUBMED_DATA))); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(PUBMED_DATA)); + logger.info(DOWNLOADING_DONE_MSG, getDataName(PUBMED_DATA)); return downloadFiles; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java index 36b3aef688..ca5473cc68 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java @@ -24,6 +24,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; 
+import java.util.Collections; import java.util.List; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -43,53 +44,31 @@ public RegulationDownloadManager(String species, String assembly, Path outdir, C @Override public List download() throws IOException, InterruptedException, CellBaseException { + // Check if the species supports this data + if (!SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), REGULATION_DATA)) { + logger.info(DATA_NOT_SUPPORTED_MSG, getDataName(REGULATION_DATA), speciesConfiguration.getScientificName()); + return Collections.emptyList(); + } + + Path regulationFolder = downloadFolder.resolve(REGULATION_DATA); + Files.createDirectories(regulationFolder); + regulatoryBuildFolder = Files.createDirectories(regulationFolder.resolve(REGULATORY_BUILD_DATA)); + motifFeaturesFolder = Files.createDirectories(regulationFolder.resolve(MOTIF_FEATURES_DATA)); + mirTarBaseFolder = Files.createDirectories(regulationFolder.resolve(MIRTARBASE_DATA)); + mirBaseFolder = Files.createDirectories(regulationFolder.resolve(MIRBASE_DATA)); + + String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); + List downloadFiles = new ArrayList<>(); - // Check if species is supported - if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), REGULATION_DATA)) { - Path regulationFolder = downloadFolder.resolve(REGULATION_DATA); - Files.createDirectories(regulationFolder); - regulatoryBuildFolder = Files.createDirectories(regulationFolder.resolve(REGULATORY_BUILD_DATA)); - motifFeaturesFolder = Files.createDirectories(regulationFolder.resolve(MOTIF_FEATURES_DATA)); - mirTarBaseFolder = Files.createDirectories(regulationFolder.resolve(MIRTARBASE_DATA)); - mirBaseFolder = Files.createDirectories(regulationFolder.resolve(MIRBASE_DATA)); - - String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); - - // Already downloaded ? 
- boolean downloadRegulatoryBuild = !isAlreadyDownloaded(regulatoryBuildFolder.resolve(getDataVersionFilename( - REGULATORY_BUILD_DATA)), getDataName(REGULATORY_BUILD_DATA)); - boolean downloadMotifFeatures = !isAlreadyDownloaded(motifFeaturesFolder.resolve(getDataVersionFilename(MOTIF_FEATURES_DATA)), - getDataName(MOTIF_FEATURES_DATA)); - boolean downloadMirTarBase = !isAlreadyDownloaded(mirTarBaseFolder.resolve(getDataVersionFilename(MIRTARBASE_DATA)), - getDataName(MIRTARBASE_DATA)) && configuration.getDownload().getMiRTarBase().getFiles().containsKey(prefixId - + MIRTARBASE_FILE_ID); - boolean downloadMirBase = !isAlreadyDownloaded(mirBaseFolder.resolve(getDataVersionFilename(MIRBASE_DATA)), - getDataName(MIRBASE_DATA)); - - if (!downloadRegulatoryBuild && !downloadMotifFeatures && !downloadMirTarBase && !downloadMirBase) { - return new ArrayList<>(); - } - - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(REGULATION_DATA)); - - if (downloadRegulatoryBuild) { - downloadFiles.addAll(downloadRegulatoryaBuild()); - } - - if (downloadMotifFeatures) { - downloadFiles.addAll(downloadMotifFeatures()); - } - - if (downloadMirTarBase) { - downloadFiles.add(downloadMiRTarBase()); - } - if (downloadMirBase) { - downloadFiles.add(downloadMirna()); - } - - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(REGULATION_DATA)); - } + logger.info(DOWNLOADING_MSG, getDataName(REGULATION_DATA)); + + downloadFiles.addAll(downloadRegulatoryaBuild()); + downloadFiles.addAll(downloadMotifFeatures()); + downloadFiles.add(downloadMiRTarBase()); + downloadFiles.add(downloadMirna()); + + logger.info(DOWNLOADING_DONE_MSG, getDataName(REGULATION_DATA)); return downloadFiles; } @@ -100,7 +79,7 @@ public List download() throws IOException, InterruptedException, C * @throws InterruptedException Any issue downloading files */ private List downloadRegulatoryaBuild() throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, 
getDataName(REGULATORY_BUILD_DATA)); + logger.info(DOWNLOADING_MSG, getDataName(REGULATORY_BUILD_DATA)); DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); @@ -119,7 +98,7 @@ private List downloadRegulatoryaBuild() throws IOException, Interr * @throws InterruptedException Any issue downloading files */ private List downloadMotifFeatures() throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(MOTIF_FEATURES_DATA)); + logger.info(DOWNLOADING_MSG, getDataName(MOTIF_FEATURES_DATA)); DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); @@ -145,7 +124,7 @@ private List downloadMotifFeatures() throws IOException, Interrupt } private DownloadFile downloadMirna() throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(MIRBASE_DATA)); + logger.info(DOWNLOADING_MSG, getDataName(MIRBASE_DATA)); return downloadAndSaveDataSource(configuration.getDownload().getMirbase(), MIRBASE_FILE_ID, MIRBASE_DATA, mirBaseFolder); } @@ -154,7 +133,7 @@ private DownloadFile downloadMiRTarBase() throws IOException, InterruptedExcepti DownloadFile downloadFile = null; String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); if (configuration.getDownload().getMiRTarBase().getFiles().containsKey(prefixId + MIRTARBASE_FILE_ID)) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(MIRTARBASE_DATA)); + logger.info(DOWNLOADING_MSG, getDataName(MIRTARBASE_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getMiRTarBase(), prefixId + MIRTARBASE_FILE_ID, MIRTARBASE_DATA, mirTarBaseFolder); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RepeatsDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RepeatsDownloadManager.java index 77a8f160f7..b1c9fae975 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RepeatsDownloadManager.java 
+++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RepeatsDownloadManager.java @@ -42,77 +42,64 @@ public List download() throws IOException, InterruptedException, C } public List downloadRepeats() throws IOException, InterruptedException, CellBaseException { + // Check if the species supports this data + if (!SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), REPEATS_DATA)) { + logger.info(DATA_NOT_SUPPORTED_MSG, getDataName(REPEATS_DATA), speciesConfiguration.getScientificName()); + return Collections.emptyList(); + } + + Path repeatsFolder = downloadFolder.resolve(REPEATS_DATA); + Files.createDirectories(repeatsFolder); + Path trfFolder = Files.createDirectories(repeatsFolder.resolve(TRF_DATA)); + Path wmFolder = Files.createDirectories(repeatsFolder.resolve(WM_DATA)); + Path gsdFolder = Files.createDirectories(repeatsFolder.resolve(GSD_DATA)); + + String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); + + String url; + Path outputPath; List downloadFiles = new ArrayList<>(); - // Check if species is supported - if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), REPEATS_DATA)) { - - Path repeatsFolder = downloadFolder.resolve(REPEATS_DATA); - Files.createDirectories(repeatsFolder); - Path trfFolder = Files.createDirectories(repeatsFolder.resolve(TRF_DATA)); - Path wmFolder = Files.createDirectories(repeatsFolder.resolve(WM_DATA)); - Path gsdFolder = Files.createDirectories(repeatsFolder.resolve(GSD_DATA)); - - String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); - - // Already downloaded ? 
- boolean downloadTrf = !isAlreadyDownloaded(trfFolder.resolve(getDataVersionFilename(TRF_DATA)), getDataName(TRF_DATA)) - && configuration.getDownload().getSimpleRepeats().getFiles().containsKey(prefixId + SIMPLE_REPEATS_FILE_ID); - boolean downloadWm = !isAlreadyDownloaded(wmFolder.resolve(getDataVersionFilename(WM_DATA)), getDataName(WM_DATA)) - && configuration.getDownload().getWindowMasker().getFiles().containsKey(prefixId + WINDOW_MASKER_FILE_ID); - boolean downloadGsd = !isAlreadyDownloaded(gsdFolder.resolve(getDataVersionFilename(GSD_DATA)), getDataName(GSD_DATA)) - && configuration.getDownload().getGenomicSuperDups().getFiles().containsKey(prefixId + GENOMIC_SUPER_DUPS_FILE_ID); - - if (!downloadTrf && !downloadWm && !downloadGsd) { - return new ArrayList<>(); - } - - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(REPEATS_DATA)); - - // Download tandem repeat finder - if (downloadTrf) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(TRF_DATA)); - String url = configuration.getDownload().getSimpleRepeats().getHost() - + configuration.getDownload().getSimpleRepeats().getFiles().get(prefixId + SIMPLE_REPEATS_FILE_ID); - Path outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); - downloadFiles.add(downloadFile(url, outputPath.toString())); - logger.info(OK_LOG_MESSAGE); - - saveDataSource(TRF_DATA, configuration.getDownload().getSimpleRepeats().getVersion(), getTimeStamp(), - Collections.singletonList(url), trfFolder.resolve(getDataVersionFilename(TRF_DATA))); - } - - // Download WindowMasker - if (downloadWm) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(WM_DATA)); - String url = configuration.getDownload().getWindowMasker().getHost() - + configuration.getDownload().getWindowMasker().getFiles().get(prefixId + WINDOW_MASKER_FILE_ID); - Path outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); - 
downloadFiles.add(downloadFile(url, outputPath.toString())); - logger.info(OK_LOG_MESSAGE); - - saveDataSource(WM_DATA, configuration.getDownload().getWindowMasker().getVersion(), getTimeStamp(), - Collections.singletonList(url), wmFolder.resolve(getDataVersionFilename(WM_DATA))); - } - - // Download genomic super duplications - if (downloadGsd) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GSD_DATA)); - String url = configuration.getDownload().getGenomicSuperDups().getHost() - + configuration.getDownload().getGenomicSuperDups().getFiles().get(prefixId + GENOMIC_SUPER_DUPS_FILE_ID); - Path outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); - downloadFiles.add(downloadFile(url, outputPath.toString())); - logger.info(OK_LOG_MESSAGE); - - saveDataSource(GSD_DATA, configuration.getDownload().getGenomicSuperDups().getVersion(), getTimeStamp(), - Collections.singletonList(url), gsdFolder.resolve(getDataVersionFilename(GSD_DATA))); - } - - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(REPEATS_DATA)); - } + logger.info(DOWNLOADING_MSG, getDataName(REPEATS_DATA)); + + + // Download tandem repeat finder + logger.info(DOWNLOADING_MSG, getDataName(TRF_DATA)); + url = configuration.getDownload().getSimpleRepeats().getHost() + + configuration.getDownload().getSimpleRepeats().getFiles().get(prefixId + SIMPLE_REPEATS_FILE_ID); + outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); + logger.info(DOWNLOADING_FROM_TO_MSG, url, outputPath); + downloadFiles.add(downloadFile(url, outputPath)); + logger.info(OK_MSG); + + saveDataSource(TRF_DATA, configuration.getDownload().getSimpleRepeats().getVersion(), getTimeStamp(), + Collections.singletonList(url), trfFolder.resolve(getDataVersionFilename(TRF_DATA))); + + // Download WindowMasker + logger.info(DOWNLOADING_MSG, getDataName(WM_DATA)); + url = configuration.getDownload().getWindowMasker().getHost() + + 
configuration.getDownload().getWindowMasker().getFiles().get(prefixId + WINDOW_MASKER_FILE_ID); + outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); + logger.info(DOWNLOADING_FROM_TO_MSG, url, outputPath); + downloadFiles.add(downloadFile(url, outputPath)); + logger.info(OK_MSG); + + saveDataSource(WM_DATA, configuration.getDownload().getWindowMasker().getVersion(), getTimeStamp(), + Collections.singletonList(url), wmFolder.resolve(getDataVersionFilename(WM_DATA))); + + // Download genomic super duplications + logger.info(DOWNLOADING_MSG, getDataName(GSD_DATA)); + url = configuration.getDownload().getGenomicSuperDups().getHost() + + configuration.getDownload().getGenomicSuperDups().getFiles().get(prefixId + GENOMIC_SUPER_DUPS_FILE_ID); + outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); + logger.info(DOWNLOADING_FROM_TO_MSG, url, outputPath); + downloadFiles.add(downloadFile(url, outputPath)); + logger.info(OK_MSG); + + saveDataSource(GSD_DATA, configuration.getDownload().getGenomicSuperDups().getVersion(), getTimeStamp(), + Collections.singletonList(url), gsdFolder.resolve(getDataVersionFilename(GSD_DATA))); + + logger.info(DOWNLOADING_DONE_MSG, getDataName(REPEATS_DATA)); return downloadFiles; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/SpliceScoreDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/SpliceScoreDownloadManager.java index 9f846f5cdb..f334ab2f00 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/SpliceScoreDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/SpliceScoreDownloadManager.java @@ -19,6 +19,7 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.utils.SpeciesUtils; import java.io.IOException; import java.nio.file.Files; @@ -37,14 +38,13 @@ 
public SpliceScoreDownloadManager(String species, String assembly, Path outdir, @Override public List download() throws IOException, InterruptedException, CellBaseException { - // Check if the species is supported - if (!speciesConfiguration.getScientificName().equals(HOMO_SAPIENS)) { - logger.info("{} not supported for the species {}", getDataName(SPLICE_SCORE_DATA), - speciesConfiguration.getScientificName()); + // Check if the species supports this data + if (!SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), SPLICE_SCORE_DATA)) { + logger.info(DATA_NOT_SUPPORTED_MSG, getDataName(SPLICE_SCORE_DATA), speciesConfiguration.getScientificName()); return Collections.emptyList(); } - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(SPLICE_SCORE_DATA)); + logger.info(DOWNLOADING_MSG, getDataName(SPLICE_SCORE_DATA)); // Create splice score directory Path spliceScorePath = downloadFolder.resolve(SPLICE_SCORE_DATA).toAbsolutePath(); @@ -56,7 +56,7 @@ public List download() throws IOException, InterruptedException, C // MMSplice saveSpliceScoreSource(MMSPLICE_DATA, configuration.getDownload().getMmSplice(), spliceScorePath); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(SPLICE_SCORE_DATA)); + logger.info(DOWNLOADING_DONE_MSG, getDataName(SPLICE_SCORE_DATA)); return Collections.emptyList(); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java index 24be1bdc98..b331ac5e41 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java @@ -25,6 +25,7 @@ import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.List; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -42,20 +43,21 @@ public 
List download() throws IOException, InterruptedException, C } public List downloadVariation() throws IOException, InterruptedException, CellBaseException { + // Check if the species supports this data + if (!SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), VARIATION_DATA)) { + logger.info(DATA_NOT_SUPPORTED_MSG, getDataName(VARIATION_DATA), speciesConfiguration.getScientificName()); + return Collections.emptyList(); + } + List downloadFiles = new ArrayList<>(); - // Check if species is supported - // and we do not need to download human variation data from Ensembl. It is already included in the CellBase. - if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), VARIATION_DATA) - && !speciesConfiguration.getScientificName().equals(HOMO_SAPIENS)) { + // For homo sapiens, we do not need to download human variation data from Ensembl because it has already been included + // in CellBase + if (!speciesConfiguration.getScientificName().equals(HOMO_SAPIENS)) { Path variationFolder = downloadFolder.resolve(VARIATION_DATA); Files.createDirectories(variationFolder); - if (isAlreadyDownloaded(variationFolder.resolve(getDataVersionFilename(VARIATION_DATA)), getDataName(VARIATION_DATA))) { - return new ArrayList<>(); - } - - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(VARIATION_DATA)); + logger.info(DOWNLOADING_MSG, getDataName(VARIATION_DATA)); DownloadFile downloadFile; String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); @@ -72,7 +74,7 @@ public List downloadVariation() throws IOException, InterruptedExc saveDataSource(VARIATION_DATA, ensemblVersion, getTimeStamp(), urls, variationFolder.resolve(getDataVersionFilename(VARIATION_DATA))); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(VARIATION_DATA)); + logger.info(DOWNLOADING_DONE_MSG, getDataName(VARIATION_DATA)); } return downloadFiles; } From a8d63681c506152edb81872a8cb760204095c982 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Sat, 10 Aug 2024 08:23:16 +0200 Subject: [PATCH 146/148] core: add dbSNP in config file (removed after merging), #TASK-5564 --- cellbase-core/src/main/resources/configuration.yml | 5 +++++ cellbase-core/src/test/resources/configuration.yml | 5 +++++ cellbase-lib/src/test/resources/configuration.test.yaml | 5 +++++ 3 files changed, 15 insertions(+) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index f1cb6daee5..523de8842b 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -231,6 +231,11 @@ download: files: GWAS: pub/databases/gwas/releases/2024/04/22/gwas-catalog-associations_ontology-annotated.tsv DBSNP: All.vcf.gz + + dbSNP: + host: https://ftp.ncbi.nih.gov/snp/latest_release/VCF/GCF_000001405.40.gz + version: "156" + pharmGKB: host: https://api.pharmgkb.org/v1/download/file/data/ version: v1 diff --git a/cellbase-core/src/test/resources/configuration.yml b/cellbase-core/src/test/resources/configuration.yml index 191d19e08c..8edc5d2581 100644 --- a/cellbase-core/src/test/resources/configuration.yml +++ b/cellbase-core/src/test/resources/configuration.yml @@ -235,6 +235,11 @@ download: files: GWAS: pub/databases/gwas/releases/2024/04/22/gwas-catalog-associations_ontology-annotated.tsv DBSNP: All.vcf.gz + + dbSNP: + host: https://ftp.ncbi.nih.gov/snp/latest_release/VCF/GCF_000001405.40.gz + version: "156" + pharmGKB: host: https://api.pharmgkb.org/v1/download/file/data/ version: v1 diff --git a/cellbase-lib/src/test/resources/configuration.test.yaml b/cellbase-lib/src/test/resources/configuration.test.yaml index cff46222d1..48b5261596 100644 --- a/cellbase-lib/src/test/resources/configuration.test.yaml +++ b/cellbase-lib/src/test/resources/configuration.test.yaml @@ -222,6 +222,11 @@ download: files: GWAS: 
pub/databases/gwas/releases/2024/04/22/gwas-catalog-associations_ontology-annotated.tsv DBSNP: All.vcf.gz + + dbSNP: + host: https://ftp.ncbi.nih.gov/snp/latest_release/VCF/GCF_000001405.40.gz + version: "156" + pharmGKB: host: https://api.pharmgkb.org/v1/download/file/data/ version: v1 From 162f34d013f9fb1daf7dceade3b23e7719a75116 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 13 Aug 2024 11:13:40 +0200 Subject: [PATCH 147/148] add: improve species and assembly parameter descriptions, #TASK-5575, #TASK-5576, #TASK-5564 --- .../app/cli/admin/AdminCliOptionsParser.java | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java index a44faeae7f..814f4f4559 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java @@ -19,12 +19,14 @@ import com.beust.jcommander.*; import org.opencb.cellbase.app.cli.CliOptionsParser; import org.opencb.cellbase.core.api.key.ApiKeyQuota; -import org.opencb.cellbase.lib.EtlCommons; import java.util.HashMap; import java.util.List; import java.util.Map; +import static org.opencb.cellbase.lib.EtlCommons.HOMO_SAPIENS; +import static org.opencb.cellbase.lib.EtlCommons.HSAPIENS; + public class AdminCliOptionsParser extends CliOptionsParser { @@ -43,6 +45,11 @@ public class AdminCliOptionsParser extends CliOptionsParser { private ServerCommandOptions serverCommandOptions; private ValidationCommandOptions validationCommandOptions; + private static final String SPECIES_DESCRIPTION = "Name of the species. 
For instance, valid formats include '" + HOMO_SAPIENS + + "' or '" + HSAPIENS + "'."; + private static final String ASSEMBLY_DESCRIPTION = "Name of the assembly, if empty the first assembly in configuration.json" + + " will be used."; + public AdminCliOptionsParser() { jCommander.setProgramName("cellbase-admin.sh"); commonCommandOptions = new CommonCommandOptions(); @@ -109,12 +116,10 @@ public class BuildCommandOptions { + " everything", required = true, arity = 1) public String data; - @Parameter(names = {"-s", "--species"}, description = "Name of the species to be built, valid formats include 'Homo sapiens' or" - + " 'hsapiens'", arity = 1) - public String species = "Homo sapiens"; + @Parameter(names = {"-s", "--species"}, description = SPECIES_DESCRIPTION, arity = 1) + public String species = HOMO_SAPIENS; - @Parameter(names = {"-a", "--assembly"}, description = "Name of the assembly, if empty the first assembly in configuration.yml" - + " will be used", arity = 1) + @Parameter(names = {"-a", "--assembly"}, description = ASSEMBLY_DESCRIPTION, arity = 1) public String assembly; @Parameter(names = {"-o", "--outdir"}, description = "Downloaded files will be saved in this directory.", required = true, @@ -142,9 +147,8 @@ public class DataListCommandOptions { @ParametersDelegate public CommonCommandOptions commonOptions = commonCommandOptions; - @Parameter(names = {"-s", "--species"}, description = "Name of the species to list the data, valid formats include 'Homo sapiens'" - + " or 'hsapiens'", arity = 1) - public String species = "Homo sapiens"; + @Parameter(names = {"-s", "--species"}, description = SPECIES_DESCRIPTION, arity = 1) + public String species = HOMO_SAPIENS; } @Parameters(commandNames = {"data-release"}, commandDescription = "Manage data releases in order to support multiple versions of data") @@ -354,12 +358,10 @@ public class ValidationCommandOptions { @ParametersDelegate public CommonCommandOptions commonOptions = commonCommandOptions; - 
@Parameter(names = {"-s", "--species"}, description = "Name of the species to be downloaded, valid format include 'Homo sapiens'" - + " or 'hsapiens'", arity = 1) - public String species = "Homo sapiens"; + @Parameter(names = {"-s", "--species"}, description = SPECIES_DESCRIPTION, arity = 1) + public String species = HOMO_SAPIENS; - @Parameter(names = {"-a", "--assembly"}, description = "Name of the assembly, if empty the first assembly in configuration.json" - + " will be used", arity = 1) + @Parameter(names = {"-a", "--assembly"}, description = ASSEMBLY_DESCRIPTION, arity = 1) public String assembly = "GRCh38"; @Parameter(names = {"--data-release"}, description = "Data release. To use the default data release, please, set this parameter" From 344e92e08b32d3e0dd31a24ea608d5bf809cd792 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 13 Aug 2024 17:05:45 +0200 Subject: [PATCH 148/148] test: fix JUnit tests by updating configuration files, #TASK-5564 --- .../opencb/cellbase/core/utils/DatabaseNameUtils.java | 2 +- cellbase-core/src/main/resources/configuration.yml | 2 +- cellbase-core/src/test/resources/configuration.yml | 10 ++++++---- .../lib/impl/core/MongoDBAdaptorFactoryTest.java | 2 +- .../src/test/resources/configuration.test.yaml | 10 ++++++---- 5 files changed, 15 insertions(+), 11 deletions(-) diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/utils/DatabaseNameUtils.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/utils/DatabaseNameUtils.java index 12954e950f..f6976ae140 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/utils/DatabaseNameUtils.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/utils/DatabaseNameUtils.java @@ -57,7 +57,7 @@ public static String cleanAssembly(String assembly) { throw new InvalidParameterException("Assembly is empty"); } - return assembly.replace("\\.", "") + return assembly.replace(".", "") .replace("-", "") .replace("_", 
"").toLowerCase(Locale.ROOT); } diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 523de8842b..6b95da49ca 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -335,7 +335,7 @@ species: - variation - variation_functional_score - missense_variation_functional_score - - clinical_variant + - clinical_variants - splice_score - ontology - pubmed diff --git a/cellbase-core/src/test/resources/configuration.yml b/cellbase-core/src/test/resources/configuration.yml index 8edc5d2581..27449e94d3 100644 --- a/cellbase-core/src/test/resources/configuration.yml +++ b/cellbase-core/src/test/resources/configuration.yml @@ -334,13 +334,15 @@ species: - conservation - repeats - gene - - regulation - protein - - clinical_variant - - missense_variation_functional_score - - ontology + - regulation + - variation - variation_functional_score + - missense_variation_functional_score + - clinical_variants - splice_score + - ontology + - pubmed - pharmacogenomics - id: mmusculus scientificName: Mus musculus diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/impl/core/MongoDBAdaptorFactoryTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/impl/core/MongoDBAdaptorFactoryTest.java index b1c244a9b3..c680dabfd6 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/impl/core/MongoDBAdaptorFactoryTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/impl/core/MongoDBAdaptorFactoryTest.java @@ -55,7 +55,7 @@ public void testGetDatabaseName() { assertThrows(InvalidParameterException.class, () -> DatabaseNameUtils.getDatabaseName("speciesName", null, cellBaseConfiguration.getVersion()), "Expected getDatabaseName() to throw an exception, but it didn't"); - assertTrue(thrown.getMessage().contains("Species and assembly are required")); + assertTrue(thrown.getMessage().contains("species and assembly are required")); // 
handle special characters databaseName = DatabaseNameUtils.getDatabaseName("speciesName", "my_funny.assembly--name", cellBaseConfiguration.getVersion()); diff --git a/cellbase-lib/src/test/resources/configuration.test.yaml b/cellbase-lib/src/test/resources/configuration.test.yaml index 48b5261596..1861b331ad 100644 --- a/cellbase-lib/src/test/resources/configuration.test.yaml +++ b/cellbase-lib/src/test/resources/configuration.test.yaml @@ -321,13 +321,15 @@ species: - conservation - repeats - gene - - regulation - protein - - clinical_variant - - missense_variation_functional_score - - ontology + - regulation + - variation - variation_functional_score + - missense_variation_functional_score + - clinical_variants - splice_score + - ontology + - pubmed - pharmacogenomics - id: mmusculus scientificName: Mus musculus