Merge branch 'release-6.2.x' into TASK-6515

opencb · Jul 30, 2024 · 1c1d8a2 · 1c1d8a2
2 parents b52c83d + 386a510
commit 1c1d8a2
Show file tree

Hide file tree

Showing 22 changed files with 499 additions and 76 deletions.
diff --git a/.github/workflows/manual-test.yml b/.github/workflows/manual-test.yml
@@ -0,0 +1,54 @@
+# Manually triggered workflow: checks out the requested branch, opens a tunnel to
+# the MongoDB service, installs the matching dependency branches and runs the
+# Maven build with the JUnit tests.
+name: Manual Junit test the project
+
+on:
+  workflow_dispatch:
+    inputs:
+      branch:
+        description: 'CellBase branch to run the tests'
+        default: 'develop'
+        required: true
+      fail-never:
+        type: boolean
+        description: 'The process executes all tests even if some fail.'
+        default: false
+        required: false
+
+jobs:
+  test:
+    name: JUnit Test
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.inputs.branch }}
+          fetch-depth: '0'
+      - name: Set up JDK 11
+        uses: actions/setup-java@v4
+        with:
+          distribution: 'temurin'
+          java-version: '11'
+          cache: 'maven'
+      - name: K8s Tunnel MongoDB
+        run: |
+          wget https://dl.k8s.io/release/v1.28.2/bin/linux/amd64/kubectl
+          chmod +x ./kubectl
+          echo "${{ secrets.AZURE_KUBE_CONFIG }}" > admin.conf
+          # The port-forward is left running in the background for the test step
+          ./kubectl -n cellbase-db port-forward services/cellbase-rs0-svc 27017:27017 --kubeconfig ./admin.conf &
+      - name: Install dependencies branches
+        env:
+          # Use the branch that was actually checked out above (the workflow input),
+          # not the ref the workflow was dispatched from. It is passed through the
+          # environment so the value is never interpolated into the shell script.
+          TEST_BRANCH: ${{ github.event.inputs.branch }}
+        run: |
+          if [ -f "./.github/workflows/scripts/get_same_branch.sh" ]; then
+            chmod +x ./.github/workflows/scripts/get_same_branch.sh
+            ./.github/workflows/scripts/get_same_branch.sh "$TEST_BRANCH"
+          else
+             echo "./.github/workflows/scripts/get_same_branch.sh does not exist."
+          fi
+      - name: Test and Analyze
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Needed to get PR information, if any
+          SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
+        run: |
+          # --fail-never makes Maven keep going after test failures
+          FAIL_NEVER=""
+          if [ "${{ github.event.inputs.fail-never }}" == "true" ]; then
+            FAIL_NEVER="--fail-never"
+          fi
+          mvn install surefire-report:report ${FAIL_NEVER} -Dcheckstyle.skip
diff --git a/.github/workflows/pull-request-approved.yml b/.github/workflows/pull-request-approved.yml
@@ -0,0 +1,15 @@
+# Builds and tests the project once a pull request review approves it.
+name: Pull request approve workflow
+
+on:
+  pull_request_review:
+    types: [ submitted ]
+
+jobs:
+  build:
+    # "submitted" also fires for plain comments and change requests, so only
+    # continue when the review actually approves the pull request.
+    if: github.event.review.state == 'approved'
+    uses: opencb/java-common-libs/.github/workflows/build-java-app-workflow.yml@develop
+
+  test:
+    name: "Test analysis"
+    uses: ./.github/workflows/test-analysis.yml
+    # Skipped automatically when the build job is skipped
+    needs: build
+    secrets: inherit
diff --git a/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile b/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile
@@ -28,4 +28,4 @@ RUN cd /opt/ensembl && \
     git clone https://github.com/Ensembl/ensembl-compara.git && \
     git clone https://github.com/Ensembl/ensembl-io.git
 
-ENV PERL5LIB=$PERL5LIB:/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase
+ENV PERL5LIB=$PERL5LIB:/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase/scripts/ensembl-scripts
diff --git a/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm b/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm
@@ -134,10 +134,10 @@ our $ENSEMBL_GENOMES_PORT = "4157";
 our $ENSEMBL_GENOMES_USER = "anonymous";
 
 ## Vertebrates
-our $HOMO_SAPIENS_CORE = "homo_sapiens_core_104_38";
-our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_104_38";
-our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_104_38";
-our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_104_38";
+our $HOMO_SAPIENS_CORE = "homo_sapiens_core_110_38";
+our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_110_38";
+our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_110_38";
+our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_110_38";
 #our $HOMO_SAPIENS_CORE = "homo_sapiens_core_78_38";
 #our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_78_38";
 #our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_78_38";

diff --git a/cellbase-app/app/scripts/gnomad/mitochondrial/README.md b/cellbase-app/app/scripts/gnomad/mitochondrial/README.md
@@ -0,0 +1,10 @@
+gnomAD Mitochondrial DNA (mtDNA) variants v3.1:
+URL: https://storage.googleapis.com/gcp-public-data--gnomad/release/3.1/vcf/genomes/gnomad.genomes.v3.1.sites.chrM.vcf.bgz
+
+Mapping file in ticket BIOINFO-99: mapping_file_gnomad_mt_mod_file.txt
+
+Script to preprocess the original gnomAD VCF: gnomad_mt.py
+
+Script to load the gnomAD mt variants into OpenCGA and export their annotation.populationFrequencies objects in JSON format: opencga_gnomad_mt.sh
+
+
diff --git a/cellbase-app/app/scripts/gnomad/mitochondrial/gnomad_mt.py b/cellbase-app/app/scripts/gnomad/mitochondrial/gnomad_mt.py
@@ -0,0 +1,120 @@
+import sys
+import gzip
+
+
+POPULATIONS = ['afr', 'ami', 'amr', 'asj', 'eas', 'fin', 'nfe', 'oth', 'sas', 'mid']
+HEADER_COMMON = [
+    '##INFO=<ID=AC,Number=1,Type=Integer,Description="Calculated allele count">',
+    '##INFO=<ID=AF,Number=1,Type=Float,Description="Calculated allele frequency">',
+    '##INFO=<ID=GTC,Number=1,Type=String,Description="Calculated list of genotype counts (0/0,0/1,1/1)">'
+]
+HEADER_POP = [
+    '##INFO=<ID=AF_{pop},Number=1,Type=Float,Description="Calculated allele frequency for {pop} population">',
+    '##INFO=<ID=AC_{pop},Number=1,Type=Integer,Description="Calculated allele count for {pop} population">',
+    '##INFO=<ID=AN_{pop},Number=1,Type=Integer,Description="Calculated allele number for {pop} population">',
+    '##INFO=<ID=GTC_{pop},Number=1,Type=String,Description="Calculated list of genotype counts for {pop} population (0/0,0/1,1/1)">'
+]
+
+
+def main():
+
+    # Creating custom header
+    custom_header = []
+    custom_header += HEADER_COMMON
+    for pop in POPULATIONS:
+        custom_header += ['\n'.join(HEADER_POP).format(pop=pop)]
+    custom_header = '\n'.join(custom_header) + '\n'
+
+    # Opening input/output files
+    vcf_input_fpath = sys.argv[1]
+    vcf_output_fpath = sys.argv[2]
+    vcf_input_fhand = gzip.open(vcf_input_fpath, 'r')
+    vcf_output_fhand = gzip.open(vcf_output_fpath, 'wt')
+
+    # Calculating new INFO fields for each variant
+    for line in vcf_input_fhand:
+        line = line.decode()
+
+        # Writing header to output
+        if line.startswith('##VEP'):  # adding custom header before "##VEP" line
+            vcf_output_fhand.write(custom_header)
+            vcf_output_fhand.write(line)
+            continue
+        if line.startswith('#'):
+            vcf_output_fhand.write(line)
+            continue
+
+        # Dict to store the new calculated data
+        new_info = {}
+
+        # Getting variant and INFO data
+        variant_items = line.strip().split()
+        info_items = variant_items[7].split(';')
+
+        for info_item in info_items:
+
+            # Getting key/value for each INFO item
+            if len(info_item.split('=', maxsplit=1)) < 2:  # skipping flags
+                continue
+            info_key, info_value = info_item.split('=', maxsplit=1)
+
+            # Getting INFO data for calculations
+            if info_key == 'pop_AF_hom':
+                pop_AF_hom = list(map(float, info_value.split('|')))
+            if info_key == 'pop_AF_het':
+                pop_AF_het = list(map(float, info_value.split('|')))
+            if info_key == 'AF_hom':
+                AF_hom = float(info_value)
+            if info_key == 'AF_het':
+                AF_het = float(info_value)
+            if info_key == 'pop_AC_hom':
+                pop_AC_hom = list(map(int, info_value.split('|')))
+            if info_key == 'pop_AC_het':
+                pop_AC_het = list(map(int, info_value.split('|')))
+            if info_key == 'AC_hom':
+                AC_hom = int(info_value)
+            if info_key == 'AC_het':
+                AC_het = int(info_value)
+            if info_key == 'pop_AN':
+                pop_AN = list(map(int, info_value.split('|')))
+            if info_key == 'AN':
+                AN = int(info_value)
+
+        # Calculating AF_{pop} and AF
+        # e.g. AF_sas = pop_AF_hom[i] + pop_AF_het[i] (i = index of sas population)
+        pop_AF = [x + y for x, y in zip(pop_AF_hom, pop_AF_het)]
+        for i, pop in enumerate(POPULATIONS):
+            new_info['AF_' + pop] = pop_AF[i]
+        new_info['AF'] = AF_hom + AF_het
+
+        # Calculating AC_{pop} and AC
+        # e.g. AC_sas = pop_AC_hom[i] + pop_AC_het[i] (i = index of sas population)
+        pop_AC = [x + y for x, y in zip(pop_AC_hom, pop_AC_het)]
+        for i, pop in enumerate(POPULATIONS):
+            new_info['AC_' + pop] = pop_AC[i]
+        new_info['AC'] = AC_hom + AC_het
+
+        # Calculating AN_{pop}
+        # e.g. AN_sas = pop_AN[i] (i = index of sas population)
+        for i, pop in enumerate(POPULATIONS):
+            new_info['AN_' + pop] = pop_AN[i]
+
+        # Calculating GTC_{pop}
+        # e.g. GTC_sas = (pop_AN[i] - (pop_AC_het[i] + pop_AC_hom[i])) + "," + pop_AC_het[i] + "," + pop_AC_hom[i]
+        pop_AC = [x + y for x, y in zip(pop_AC_hom, pop_AC_het)]
+        hom_ref = [x - y for x, y in zip(pop_AN, pop_AC)]
+        for i, pop in enumerate(POPULATIONS):
+            new_info['GTC_' + pop] = ','.join(map(str, [hom_ref[i], pop_AC_het[i], pop_AC_hom[i]]))
+        new_info['GTC'] = ','.join(map(str, [AN - (AC_hom + AC_het), AC_het, AC_hom]))
+
+        # Joining existing INFO field and new custom INFO data
+        custom_info_data = ';'.join(['='.join([k, str(new_info[k])]) for k in new_info])
+        new_info_field = ';'.join(info_items + [custom_info_data])
+
+        # Replacing original INFO field
+        variant_items[7] = new_info_field
+        vcf_output_fhand.write('\t'.join(variant_items) + '\n')
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/cellbase-app/app/scripts/gnomad/mitochondrial/opencga_gnomad_mt.sh b/cellbase-app/app/scripts/gnomad/mitochondrial/opencga_gnomad_mt.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+# Loads the preprocessed gnomAD mt VCF into OpenCGA, indexes it, calculates the
+# cohort stats described in the mapping file and exports the resulting
+# annotation.populationFrequencies in json format.
+
+# Variables
+user="user"
+host="host_name"
+project="population"
+project_name="Population"
+study="gnomad_mt"
+study_name="gnomAD v3.1 Mitocondrial DNA Variants"
+study_path="data/$study"
+folder_path="/home/gnomad_mt"
+mapping_file="mapping_file_gnomad_mt_mod_file.txt"
+vcf_file="gnomad.genomes.v3.1.sites.chrM.mod.vcf.gz"
+# Folder and file name need an explicit "/" between them
+mapping_file_path="$folder_path/$mapping_file"
+vcf_file_path="$folder_path/$vcf_file"
+opencga="/home/opencga-client-2.12.0/bin/opencga.sh"
+
+# All expansions are quoted: values such as $study_name contain spaces and
+# would otherwise be split into several arguments.
+
+# Login
+"$opencga" login "$user" --host "$host"
+
+# Project creation
+"$opencga" projects create --id "$project" --name "$project_name" --organism-scientific-name hsapiens --organism-assembly grch38 --host "$host"
+
+# Study creation
+"$opencga" studies create --id "$study" --name "$study_name" --project "$project" --host "$host"
+
+# Folders creation within Catalog
+"$opencga" files create --path "$study_path" --parents --study "$study" --type DIRECTORY --host "$host"
+
+# Uploading gnomad mt variants VCF and mapping file for gnomad mt variants
+"$opencga" files upload -i "$mapping_file_path" --path "$study_path" --study "$study" --host "$host"
+
+"$opencga" files upload -i "$vcf_file_path" --path "$study_path" --study "$study" --host "$host"
+
+# Variant index for gnomad mt variants VCF
+"$opencga" operations variant-index --study "$study" --file "$vcf_file" --load-archive NO --load-split-data CHROMOSOME --host "$host"
+
+# Variant stats index for gnomad mt variants. The corresponding cohorts and variant cohort stats will be generated using the information of interest provided in the mapping file and INFO column of the gnomad mt VCF
+"$opencga" operations variant-stats-index --study "$study" --aggregation-mapping-file "$mapping_file" --aggregated BASIC --host "$host"
+
+# Variant cohort stats will be converted to population frequencies data model (julie-tool)
+"$opencga" operations variant-julie-run --project "$project" --host "$host"
+
+# Export of annotation.populationFrequencies in json format
+"$opencga" variant export-run --body_include annotation.populationFrequencies --body_project "$project" --project "$project" --output-file-format json --host "$host"
diff --git a/cellbase-app/app/scripts/gnomad_mt_prepare.py b/cellbase-app/app/scripts/gnomad_mt_prepare.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+
+#  Copyright 2015-2020 OpenCB
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import argparse
+import os
+import requests
+import sys
+import json
+import pathlib
+from pathlib import Path
+
+
+## Configure command-line options
+parser = argparse.ArgumentParser()
+parser.add_argument('-i', help="VCF file", required=True)
+
+
+## Parse command-line parameters and init basedir, tag and build_folder
+args = parser.parse_args()
+print(args.i)
+
+if os.path.isfile(args.i) == False:
+    print("no existe")
+
+
+# Opening file
+vcf_file = open(args.i, 'r')
+count = 0
+
+# Using for loop
+print("Using for loop")
+for line in vcf_file:
+    count += 1
+    if not line.startswith("#"):
+        line = line.strip()
+        cols = line.split("\t")
+        print(line)
+        info_cols = cols[7].split(";")
+        var = [x for x in info_cols if x.startswith("AN=")]
+        print("{}".format(var))
+
+
+# Closing files
+vcf_file.close()
diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java
@@ -26,6 +26,7 @@ public class DownloadProperties {
     private EnsemblProperties ensembl;
     private EnsemblProperties ensemblGenomes;
     private URLProperties hgnc;
+    private URLProperties cancerHotspot;
     private URLProperties refSeq;
     private URLProperties refSeqFasta;
     private URLProperties refSeqProteinFasta;
@@ -71,6 +72,7 @@ public class DownloadProperties {
     private URLProperties hpoObo;
     private URLProperties goObo;
     private URLProperties doidObo;
+    private URLProperties mondoObo;
     private URLProperties goAnnotation;
     private URLProperties revel;
     private URLProperties pubmed;
@@ -527,6 +529,24 @@ public DownloadProperties setHgnc(URLProperties hgnc) {
         return this;
     }
 
+    /**
+     * Returns the current value of the {@code cancerHotspot} property.
+     *
+     * @return the {@code cancerHotspot} URL properties
+     */
+    public URLProperties getCancerHotspot() {
+        return this.cancerHotspot;
+    }
+
+    /**
+     * Stores the given value in the {@code cancerHotspot} property.
+     *
+     * @param cancerHotspot the URL properties to store
+     * @return this instance, so that setter calls can be chained
+     */
+    public DownloadProperties setCancerHotspot(URLProperties cancerHotspot) {
+        this.cancerHotspot = cancerHotspot;
+        return this;
+    }
+
+    /**
+     * Returns the current value of the {@code mondoObo} property.
+     *
+     * @return the {@code mondoObo} URL properties
+     */
+    public URLProperties getMondoObo() {
+        return this.mondoObo;
+    }
+
+    /**
+     * Stores the given value in the {@code mondoObo} property.
+     *
+     * @param mondoObo the URL properties to store
+     * @return this instance, so that setter calls can be chained
+     */
+    public DownloadProperties setMondoObo(URLProperties mondoObo) {
+        this.mondoObo = mondoObo;
+        return this;
+    }
+
     public static class EnsemblProperties {
 
         private DatabaseCredentials database;