class: center, middle # Analysis Pipelines with UNIX tools --- # Pipelines * Multiple steps to achieve analyses * Often steps that have dependancies * Want to restart and not have to re-run everything * Use cluster queueing system (qsub) --- # Schematics ![PHYling pipeline](img/PHYling_pipeline.png) --- # Web-based pipelines * [Galaxy - https://usegalaxy.org/](https://usegalaxy.org/) * [Taverna - http://www.taverna.org.uk/](http://www.taverna.org.uk/) --- #GATK SNP calling - prepare data ![GATK1](img/GATK_1.png) --- #Shell scripts for pipelines: Data files Setup a script which lists the files to process. In my case, 3 columns: Strain name, Read 1, Read 2 ```text S1.ATCC42720 Samples/1_1_sequence.txt.gz Samples/1_2_sequence.txt.gz S2.U10A Samples/2_1_sequence.txt.gz Samples/2_2_sequence.txt.gz S3.U3G Samples/3_1_sequence.txt.gz Samples/3_2_sequence.txt.gz S4.U3B Samples/4_1_sequence.txt.gz Samples/4_2_sequence.txt.gz S5.U10D Samples/5_1_sequence.txt.gz Samples/5_2_sequence.txt.gz S6.U5C Samples/6_1_sequence.txt.gz Samples/6_2_sequence.txt.gz S7.U3E Samples/7_1_sequence.txt.gz Samples/7_2_sequence.txt.gz ... ``` --- #Script for alignment ```shell #PBS -l nodes=1:ppn=32,mem=24gb -j oe -N bwa module load bwa module load samtools module load java module load picard CPU=1 SAMPLEFILE=samples.info BWA=bwa GENOMEIDX=DBfolder/candida_lusitaniae_ATCC42720_w_CBS_6936_MT.fasta OUTPUT=bam QUAL=20 mkdir -p $OUTPUT if [ $PBS_NUM_PPN ]; then CPU=$PBS_NUM_PPN fi echo "CPU is $CPU" ``` --- #Script - Part 2 ```bash LINE=$PBS_ARRAYID if [ ! $LINE ]; then LINE=$1 #take from cmdline fi if [ ! $LINE ]; then echo "Need a number via PBS_ARRAYID or cmdline" exit fi ROW=`head -n $LINE $SAMPLEFILE | tail -n 1` SAMPLE=`echo "$ROW" | awk '{print $1}'` READ1=`echo "$ROW" | awk '{print $2}'` READ2=`echo "$ROW" | awk '{print $3}'` INDIR=`dirname $READ1` ``` --- #Script - Part 3 - Actual Work ```bash if [ ! -f $OUTPUT/$SAMPLE.DD.bam ]; then # Do the alignment if file doesn't exist if [ ! -f $OUTPUT/$SAMPLE.sam ]; then $BWA mem -t $CPU $GENOMEIDX $INDIR/$SAMPLE.1.fq \ $INDIR/$SAMPLE.2.fq > $OUTPUT/$SAMPLE.sam fi # Fix the Read Groups for the alignment if [ ! -f $OUTPUT/$SAMPLE.RG.bam ]; then java -jar $PICARD AddOrReplaceReadGroups I=$OUTPUT/$SAMPLE.sam \ O=$OUTPUT/$SAMPLE.RG.bam RGLB=$SAMPLE RGID=$SAMPLE RGSM=$SAMPLE \ RGPL=illumina RGCN=UCR_Cofactor \ RGDS="$SAMPLE.1.fq $SAMPLE.2.fq" CREATE_INDEX=true SO=coordinate # mark Duplicates files java -jar $PICARD MarkDuplicates I=$OUTPUT/$SAMPLE.RG.bam \ O=$OUTPUT/$SAMPLE.DD.bam METRICS_FILE=$SAMPLE.dedup.metrics \ CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT fi ``` --- #Do Realignment ```bash # Make index if [ ! -f $BAMDIR/$ROW.DD.bai ]; then java -jar $PICARD BuildBamIndex I=$BAMDIR/$ROW.DD.bam fi if [ ! -f $BAMDIR/$ROW.intervals ]; then java -Xmx$MEM -jar $GATK \ -T RealignerTargetCreator \ -R $GENOMEIDX \ -I $BAMDIR/$ROW.DD.bam \ -o $BAMDIR/$ROW.intervals fi if [ ! -f $BAMDIR/$ROW.realign.bam ]; then java -Xmx$MEM -jar $GATK \ -T IndelRealigner \ -R $GENOMEIDX \ -I $BAMDIR/$ROW.DD.bam \ -targetIntervals $BAMDIR/$ROW.intervals \ -o $BAMDIR/$ROW.realign.bam fi ``` --- #GATK SNP calling ![GATK2](img/GATK_2.png) --- #HaplotypeCaller with GATK ```bash N=`ls $INDIR/*.realign.bam | sed -n ${PBS_ARRAYID}p` O=`basename $N .realign.bam` if [ ! -f $OUTDIR/$O.g.vcf ]; then java -Xmx32g -jar $GATK \ -T HaplotypeCaller \ -ERC GVCF \ -ploidy 2 \ -I $N -R $GENOME \ -o $OUTDIR/$O.g.vcf -nct $CPU fi ``` --- #HaplotypeCaller joint caller [GATK Joint Caller](https://www.broadinstitute.org/gatk/guide/article?id=2803) ```bash module load java module load GATK INDIR=Variants/ATCC_MTfix OUT=Variants/C_lus.ATCCMTfix_noRef.vcf N=`ls $INDIR/*.g.vcf | grep -v 'ATCC42720' | \ perl -p -e 's/\n/ /; s/(\S+)/-V $1/'` java -Xmx$MEM -jar $GATK \ -T GenotypeGVCFs \ -R $GENOME \ $N \ -o $OUT \ -nt $CPU ```