workshop_commands.sh

# The goal is to perform Differential Gene Expression analysis
# We will run through the workflow starting from raw files in FASTQ format all the way to generating gene list
# The input files are yeast RNAseq dataset from Schurch et al, 2016 study
# We will demonstrate the different workflow steps using a single file for concept
# Inlcuded in this binder are also commands to run all the steps on 6 files together using Snakefile

# Setting up your workspace
# Initialize conda
conda init

# Shorten the command prompt
echo "PS1='\w $ '" >> .bashrc

# Update for changes to take place
source .bashrc
## Note you could also restart the terminal for the same effect

# Create conda environment from YAML file
mamba env create -f rnaseq-env.yml

# Activate conda environment
conda activate rnaseq-qc-map

# Setting up project directory
# Create project directory
mkdir myproject

# Create data folder
mkdir myproject/raw_data

# Create reference data folder
mkdir myproject/ref_files

# Create fastqc folder for raw data
mkdir myproject/raw_data/fastqc

# Create trimmed data folder
mkdir myproject/trimmed

# Create fastqc folder for trimmed data
mkdir myproject/trimmed/fastqc

# Create folder to hold salmon results
mkdir myproject/quant

# View the project directory structure using tree
tree myproject/

# Download files
# Download data file
curl -L https://osf.io/5daup/download -o myproject/raw_data/ERR458493.fq.gz

# Download metadata file
curl -L https://osf.io/9c5jz/download -o myproject/raw_data/err_md5sum.txt

# Download the TruSeq2 primer set
curl -L https://raw.githubusercontent.com/timflutre/trimmomatic/master/adapters/TruSeq2-SE.fa -o myproject/ref_files/TruSeq2-SE.fa

# Download the yeast transcriptome
curl -L ftp://ftp.ensembl.org/pub/release-99/fasta/saccharomyces_cerevisiae/cdna/Saccharomyces_cerevisiae.R64-1-1.cdna.all.fa.gz -o myproject/ref_files/Saccharomyces_cerevisiae.R64-1-1.cdna.all.fa.gz

## Exercise 1
# Use the tree command to determine if you have downloaded the correct files to the correct directories.
# Note that the first two files are in raw_data, but the second two are in ref_files. Why?

# Validation
# Check the integrity of downloaded raw data file
md5sum myproject/raw_data/ERR458493.fq.gz

# Compare with metdata file
head -n 1 myproject/raw_data/err_md5sum.txt
## Note you can use the first 7-8 characters of the hash for comparison

## Exercise 2
# Check that you have the correct hash (26e91cde337004233b7f43aff7102828) for the Saccharomyces_cerevisiae transcriptome

# Anatomy of FASTQ file
# Clear your terminal screen
clear

# Print out the first 4 lines of the FASTQ file
zcat myproject/raw_data/ERR458493.fq.gz | head -n 4

# Print out the last few lines of the FASTQ file
zcat myproject/raw_data/ERR458493.fq.gz | tail

# Count the number of lines in the FASTQ file
zcat myproject/raw_data/ERR458493.fq.gz | wc -l

## Exercise 3
# What can knowing the number of lines tell you?

# Caretaking
# Look at the permissions on the raw data files
ls -l myproject/raw_data/

# Update permissions on the raw data file to make it read only
chmod a-w myproject/raw_data/ERR458493.fq.gz

# Check updated permissions
ls -l myproject/raw_data/

## Exercise 4 (Bonus)
# Change the permissions of the md5 file.

## INSTRUCTOR SWITCH

# Quality Assurance
# Run fastqc on the raw data file
fastqc myproject/raw_data/ERR458493.fq.gz --outdir myproject/raw_data/fastqc

## Exercise 5
# Can you use tree to verify that your command ran successfully?

## Exercise 6
# Using the Files/Plots/Packages pane navigate to the files generated by fastqc and click on the `html` file to open it in another tab of your browser.

# Trimming
# Trim to remove adaptor from raw FASTQ file using trimmomatic
trimmomatic SE myproject/raw_data/ERR458493.fq.gz myproject/trimmed/ERR458493.qc.fq.gz ILLUMINACLIP:myproject/ref_files/TruSeq2-SE.fa:2:0:15 LEADING:2 TRAILING:2 SLIDINGWINDOW:4:2 MINLEN:25

# Quality Assurance II
# QC on trimmed file
fastqc myproject/trimmed/ERR458493.qc.fq.gz --outdir myproject/trimmed/fastqc/

## Exercise 7
# Compare the trimmed FastQC report to the original one. What has changed? Has it improved the data?

## Exercise 8
# We can test to see if increasing the trimming makes a large difference in this dataset.
# Re-run trimmomatic with leading and trailing minimum scores of 5, then run a FastQC report for your new trim. Be sure to name the output files something that will distinguish them from the ones you already have.
# Compare the fastqc report to the one with a minimum of 2. Has it improved the data?

# Running salmon
# Generate salmon index
salmon index --index myproject/quant/sc_ensembl_index --transcripts myproject/ref_files/Saccharomyces_cerevisiae.R64-1-1.cdna.all.fa.gz

## Exercise 9
# What kinds of files does indexing make and what do they tell you?
# Either in the command line or the file browser check out one or two of the files that were created. Avoid the .bin files, these are in binary and won’t open. What is in the non-binary files? Are any of them useful? Report back in the chat if you find something that seems important.

# Generating salmon index including decoy file
salmon index --index myproject/quant/sc_ensembl_index --transcripts yeast_ref/Saccharomyces_cerevisiae_combined.fa -d  yeast_ref/decoys.txt

## Exercise 10
# Check for warnings. Look at the pre_indexing.log file and see what it says.

# Quantify reads with salmon
salmon quant -i myproject/quant/sc_ensembl_index --libType A -r myproject/trimmed/ERR458493.qc.fq.gz -o myproject/quant/ERR458493_quant --seqBias --gcBias --validateMappings

# Look at salmon output
ls myproject/quant/ERR458493_quant/

# Examine the contents of a individual salmon output file
less myproject/quant/ERR458493_quant/aux_info/meta_info.json

## Exercise 11
# The file we’re most interested in is the count file. It’s called quant.sf. Navigate to it and open it. What is in this file?

## Exercise 12
# What would you expect to happen if you ran Salmon on 2 sequencing files?

## Exercise 13
# Run Salmon on the more stringently trimmed version of your file.
# How did it go? In the zoom chat tell us whether more stringent trimming improved your mapping rate.

# Running with multiple files (BONUS)
snakemake -j 4 --use-conda

## Exercise 14
# Snakemake is working in a different folder than you were so it won’t overwrite all your hard work. Using one of the methods you’ve learned today, checkout the ‘rnaseq’ folder to see what the output for 6 files look like. Try to answer some of these questions:
# How does quality compare across the six samples?
# How do mapping rates compare across samples?
# The automated pipeline is coded to use a slightly different folder setup. Do you like one better than the other? How would you organize your workspace to make the most sense to you?