Variant calling and annotation: Practical session


#!/bin/bash
 
# Working directory
 
cd $HOME/mda13/calling
 
# Checking the reference
 
head $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta
 
head -3000 $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta | tail
 
#Checking the bed file
 
head $HOME/mda13/calling/resources/Exon_50mb_hg19_chr20.bed
 
 
# IGV
igv &
 
####################################################################################
 
################    SNV Calling
 
####################################################################################
 
###### GATK
 
# Run program and see options
java -jar $HOME/mda13/calling/software/gatk/GenomeAnalysisTK.jar
 
# UnifiedGenotyper
 
java -jar $HOME/mda13/calling/software/gatk/GenomeAnalysisTK.jar\
 -T UnifiedGenotyper
 
 
# Executing SNVs calling of variants
java -jar $HOME/mda13/calling/software/gatk/GenomeAnalysisTK.jar\
 -T UnifiedGenotyper\
 -R $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta\
 -L $HOME/mda13/calling/resources//Exon_50mb_hg19_chr20.bed\
 -I $HOME/mda13/calling/mapping/test_final.bam\
 -glm SNP \
 -o snvs_raw_gatk.vcf
 
 
# Checking file
less snvs_raw_gatk.vcf
 
# Counting lines
du -hs snvs_raw_gatk.vcf
wc -l snvs_raw_gatk.vcf
 
 
# Labeling VCF files
# VariantFiltration
$HOME/mda13/calling/software/gatk/GenomeAnalysisTK.jar \
-T VariantFiltration
 
# Labeling SNVs VCF file
java -jar $HOME/mda13/calling/software/gatk/GenomeAnalysisTK.jar\
 -T VariantFiltration\
 -filter "QD < 2.0"\
 -filterName "QD"\
 -filter "MQ < 40.0"\
 -filterName "MQ"\
 -filter "FS > 60.0"\
 -filterName "FS"\
 -filter "HaplotypeScore > 13.0"\
 -filterName "HaplotypeScore"\
 -filter "MQRankSum < -12.5"\
 -filterName "MQRankSum"\
 -filter "ReadPosRankSum < -8.0"\
 -filterName "ReadPosRankSum"\
 -R $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta\
 -V snvs_raw_gatk.vcf\
 -o snvs_raw_gatk_labeled.vcf
 
# Checking files
wc -l snvs_raw_gatk.vcf
wc -l snvs_raw_gatk_labeled.vcf
grep PASS snvs_raw_gatk_labeled.vcf | wc -l
 
 
 
###### Samtools
 
# Run program and see options
$HOME/mda13/calling/software/samtools/samtools
 
# UnifiedGenotyper
 
$HOME/mda13/calling/software/samtools/samtools mpileup
 
# Variant calling
$HOME/mda13/calling/software/samtools/samtools mpileup\
 -u -C 50 -l $HOME/mda13/calling/resources/Exon_50mb_hg19_chr20.bed\
 -f $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta\
 $HOME/mda13/calling/mapping/test_final.bam |\
 $HOME/mda13/calling/software/samtools/bcftools/bcftools view -bvcg - >\
 raw_samtools.bcf
 
# Converting to VCF format of snvs
$HOME/mda13/calling/software/samtools/bcftools/bcftools view\
 raw_samtools.bcf |\
 grep -v INDEL > snvs_raw_samtools.vcf
 
# Filtering variants
$HOME/mda13/calling/software/samtools/bcftools/vcfutils.pl varFilter\
 snvs_raw_samtools.vcf >\
 snvs_raw_samtools_filtered.vcf
 
 
 
#Check positions:
# Variants detected by GATK but not by Samtools
20:238507
20:590542
20:590543
20:865896
20:1291076
20:1517885
20:1895755
20:2413125
20:2539387
20:3025514
20:3657803
20:3657804
 
 
# Variants detected by Samtools but not by GATK
20:397928
20:944649
20:1209155
20:2380332
20:2380396
20:3650205
20:3734514
20:4680208
20:13060543
20:18123556
20:18511456
20:21142813
20:23016692
20:23729722
 
 
 
####################################################################################
 
################    Indel Calling
 
####################################################################################
 
###### GATK
 
 
# Executing indels calling of variants
java -jar $HOME/mda13/calling/software/gatk/GenomeAnalysisTK.jar\
 -T UnifiedGenotyper \
 -R $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta\
 -L $HOME/mda13/calling/resources/Exon_50mb_hg19_chr20.bed\
 -I $HOME/mda13/calling/mapping/test_final.bam\
 -glm INDEL\
 -o indels_raw_gatk.vcf
 
# Checking file
less indels_raw_gatk.vcf
 
# Counting lines
du -hs indels_raw_gatk.vcf
wc -l indels_raw_gatk.vcf
 
# Labeling indels VCF file
java -jar $HOME/mda13/calling/software/gatk/GenomeAnalysisTK.jar\
 -T VariantFiltration\
 -filter "QD < 2.0"\
 -filterName "QD"\
 -filter "FS > 200.0"\
 -filterName "FS"\
 -filter "ReadPosRankSum < -20.0"\
 -filterName "ReadPosRankSum"\
 -R $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta\
 -V indels_raw_gatk.vcf\
 -o indels_raw_gatk_labeled.vcf
 
# Checking files
wc -l indels_raw_gatk.vcf
wc -l indels_raw_gatk_labeled.vcf
grep PASS indels_raw_gatk_labeled.vcf | wc -l
 
 
############################## Samtools
 
# Indel calling
 
$HOME/mda13/calling/software/samtools/bcftools/bcftools view\
 raw_samtools.bcf |\
 grep INDEL > indels_raw_samtools.vcf
 
# Filtering indels
$HOME/mda13/calling/software/samtools/bcftools/vcfutils.pl varFilter\
 indels_raw_samtools.vcf >\
 indels_raw_samtools_filtered.vcf
 
 
#Check positions:
# Indels detected by GATK but not by Samtools
20:10622080
20:29632674
20:31292298
20:31384616
20:31427484
20:35695535
20:42249479
20:50008006
20:50015300
20:50713652
20:50768672
20:57828914
20:60294246 <---- homopolymer
20:61167632 <---- homopolymer
 
 
 
# Indels detected by Samtools but not by GATK
20:944691
20:3052951
20:29449623
20:29623248
20:36149747
20:40074338
20:57828924
20:60737994
 
############################# Dindel
 
mkdir dindel_output
 
# Stage 1
 
$HOME/mda13/calling/software/dindel/dindel --analysis getCIGARindels\
 --bamFile $HOME/mda13/calling/mapping/test_final.bam\
 --outputFile dindel_output/test_dindel\
 --ref $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta
 
# Stage 2
$HOME/mda13/calling/software/dindel/makeWindows.py\
 --inputVarFile dindel_output/test_dindel.variants.txt\
 --windowFilePrefix dindel_output/test_realign_windows\
 --numWindowsPerFile 1000
 
# Stage 3
for varfile in dindel_output/*_realign_windows.*.txt
    do number=`basename $varfile .txt | sed 's/^.*_realign_windows.//'`
 
    $HOME/mda13/calling/software/dindel/dindel --analysis indels --doDiploid\
 --bamFile $HOME/mda13/calling/mapping/test_final.bam\
 --ref $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta\
 --varFile $varfile\
 --libFile dindel_output/test_dindel.libraries.txt\
 --outputFile dindel_output/test_dindel_stage2_output_windows.$number
done
 
# Stage 4 ----------- Editing third line of mergeOutputDiploid.py script with utils path
# Set path to $HOME/mda13/calling/software/dindel/utils
 
ls dindel_output/*.glf.txt > dindel_output/test_glfFiles.txt
 
$HOME/mda13/calling/software/dindel/mergeOutputDiploid.py\
 -i dindel_output/test_glfFiles.txt\
 -o indels_raw_dindel.vcf\
 -s test\
 -r $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta
 
 
# Stage 5
# Select PASS indels
grep PASS indels_raw_dindel.vcf > indels_raw_dindel_filtered.vcf
 
# Indels detected only by dindel
20:271135
20:334317
20:865664
20:2291164
20:2411731
20:2797764 
20:3212222
20:3674821 <----- homopolymer
20:3838205
20:3928844 <----- homopolymer
20:4768475
20:4778799
20:5086720 <----- homopolymer
20:5556686
20:5948169
20:5948421
20:29633966 <----- homopolymer
20:29647107
20:30354257
20:31291919
20:31292184