Pipelines

class: center, middle

# Analysis Pipelines with UNIX tools

---
      # Pipelines
      
      * Multiple steps to achieve analyses
      * Often steps that have dependancies
      * Want to restart and not have to re-run everything
      * Use cluster queueing system (qsub)

---
      # Schematics

![PHYling pipeline](img/PHYling_pipeline.png)
            
      ---
      # Web-based pipelines

* [Galaxy - https://usegalaxy.org/](https://usegalaxy.org/)

* [Taverna - http://www.taverna.org.uk/](http://www.taverna.org.uk/)

---
      #GATK SNP calling - prepare data

![GATK1](img/GATK_1.png)
      
      ---
      #Shell scripts for pipelines: Data files
      Setup a script which lists the files to process. In my case, 3
      columns: Strain name, Read 1, Read 2
      ```text
      S1.ATCC42720 Samples/1_1_sequence.txt.gz Samples/1_2_sequence.txt.gz
      S2.U10A Samples/2_1_sequence.txt.gz Samples/2_2_sequence.txt.gz
      S3.U3G Samples/3_1_sequence.txt.gz Samples/3_2_sequence.txt.gz
      S4.U3B Samples/4_1_sequence.txt.gz Samples/4_2_sequence.txt.gz
      S5.U10D Samples/5_1_sequence.txt.gz Samples/5_2_sequence.txt.gz
      S6.U5C Samples/6_1_sequence.txt.gz Samples/6_2_sequence.txt.gz
      S7.U3E Samples/7_1_sequence.txt.gz Samples/7_2_sequence.txt.gz
      ...
      ```

---
      #Script for alignment
      ```shell
      #PBS  -l nodes=1:ppn=32,mem=24gb -j oe -N bwa
      module load bwa
      module load samtools
      module load java
      module load picard

CPU=1
      SAMPLEFILE=samples.info
      BWA=bwa
      GENOMEIDX=DBfolder/candida_lusitaniae_ATCC42720_w_CBS_6936_MT.fasta
      OUTPUT=bam
      QUAL=20

mkdir -p $OUTPUT

if [ $PBS_NUM_PPN ]; then
         CPU=$PBS_NUM_PPN
      fi
      echo "CPU is $CPU"
      ```
      ---
      #Script - Part 2
      ```bash
      LINE=$PBS_ARRAYID
      if [ ! $LINE ]; then
         LINE=$1  #take from cmdline
      fi

if [ ! $LINE ]; then
         echo "Need a number via PBS_ARRAYID or cmdline"
         exit
      fi

ROW=`head -n $LINE $SAMPLEFILE | tail -n 1`
      SAMPLE=`echo "$ROW" | awk '{print $1}'`
      READ1=`echo "$ROW" | awk '{print $2}'`
      READ2=`echo "$ROW" | awk '{print $3}'`
      INDIR=`dirname $READ1`
      ```

---
      #Script - Part 3 - Actual Work
      ```bash
      if [ ! -f $OUTPUT/$SAMPLE.DD.bam ]; then

# Do the alignment if file doesn't exist
       if [ ! -f $OUTPUT/$SAMPLE.sam ]; then
         $BWA mem -t $CPU $GENOMEIDX $INDIR/$SAMPLE.1.fq \
         $INDIR/$SAMPLE.2.fq > $OUTPUT/$SAMPLE.sam
       fi

# Fix the Read Groups for the alignment
       if [ ! -f $OUTPUT/$SAMPLE.RG.bam ]; then
         java -jar $PICARD AddOrReplaceReadGroups I=$OUTPUT/$SAMPLE.sam \
       O=$OUTPUT/$SAMPLE.RG.bam RGLB=$SAMPLE RGID=$SAMPLE RGSM=$SAMPLE \
       RGPL=illumina RGCN=UCR_Cofactor \
       RGDS="$SAMPLE.1.fq $SAMPLE.2.fq" CREATE_INDEX=true SO=coordinate

# mark Duplicates files
      java -jar $PICARD MarkDuplicates I=$OUTPUT/$SAMPLE.RG.bam \
      O=$OUTPUT/$SAMPLE.DD.bam METRICS_FILE=$SAMPLE.dedup.metrics \
      CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT

fi
      ```

---
      #Do Realignment
      
      ```bash
      # Make index
      if [ ! -f $BAMDIR/$ROW.DD.bai ]; then
        java -jar $PICARD BuildBamIndex I=$BAMDIR/$ROW.DD.bam
      fi

if [ ! -f $BAMDIR/$ROW.intervals ]; then
       java -Xmx$MEM -jar $GATK \
       -T RealignerTargetCreator \
       -R $GENOMEIDX \
       -I $BAMDIR/$ROW.DD.bam \
       -o $BAMDIR/$ROW.intervals
      fi

if [ ! -f $BAMDIR/$ROW.realign.bam ]; then
        java -Xmx$MEM -jar $GATK \
        -T IndelRealigner \
        -R $GENOMEIDX \
        -I $BAMDIR/$ROW.DD.bam \
        -targetIntervals $BAMDIR/$ROW.intervals \
        -o $BAMDIR/$ROW.realign.bam
      fi
      ```

---
      #GATK SNP calling

![GATK2](img/GATK_2.png)

---
      #HaplotypeCaller with GATK

```bash

N=`ls $INDIR/*.realign.bam | sed -n ${PBS_ARRAYID}p`
      O=`basename $N .realign.bam`
      
      if [ ! -f $OUTDIR/$O.g.vcf ]; then
        java -Xmx32g -jar $GATK \
        -T HaplotypeCaller \
        -ERC GVCF \        
        -ploidy 2 \
        -I $N -R $GENOME \
        -o $OUTDIR/$O.g.vcf -nct $CPU
      fi
      ```

---
      #HaplotypeCaller joint caller

[GATK Joint Caller](https://www.broadinstitute.org/gatk/guide/article?id=2803)

```bash
      module load java
      module load GATK
      INDIR=Variants/ATCC_MTfix
      OUT=Variants/C_lus.ATCCMTfix_noRef.vcf

N=`ls $INDIR/*.g.vcf | grep -v 'ATCC42720' | \
       perl -p -e 's/\n/ /; s/(\S+)/-V $1/'`

java -Xmx$MEM -jar $GATK \
      -T GenotypeGVCFs \
      -R $GENOME \
      $N \
      -o $OUT \
      -nt $CPU
      ```