COURSE OVERVIEW
LOGISTICS
LAST COURSE PHOTOS
#!/bin/bash # Working directory cd $HOME/mda13/calling # Checking the reference head $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta head -3000 $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta | tail #Checking the bed file head $HOME/mda13/calling/resources/Exon_50mb_hg19_chr20.bed # IGV igv & #################################################################################### ################ SNV Calling #################################################################################### ###### GATK # Run program and see options java -jar $HOME/mda13/calling/software/gatk/GenomeAnalysisTK.jar # UnifiedGenotyper java -jar $HOME/mda13/calling/software/gatk/GenomeAnalysisTK.jar\ -T UnifiedGenotyper # Executing SNVs calling of variants java -jar $HOME/mda13/calling/software/gatk/GenomeAnalysisTK.jar\ -T UnifiedGenotyper\ -R $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta\ -L $HOME/mda13/calling/resources//Exon_50mb_hg19_chr20.bed\ -I $HOME/mda13/calling/mapping/test_final.bam\ -glm SNP \ -o snvs_raw_gatk.vcf # Checking file less snvs_raw_gatk.vcf # Counting lines du -hs snvs_raw_gatk.vcf wc -l snvs_raw_gatk.vcf # Labeling VCF files # VariantFiltration $HOME/mda13/calling/software/gatk/GenomeAnalysisTK.jar \ -T VariantFiltration # Labeling SNVs VCF file java -jar $HOME/mda13/calling/software/gatk/GenomeAnalysisTK.jar\ -T VariantFiltration\ -filter "QD < 2.0"\ -filterName "QD"\ -filter "MQ < 40.0"\ -filterName "MQ"\ -filter "FS > 60.0"\ -filterName "FS"\ -filter "HaplotypeScore > 13.0"\ -filterName "HaplotypeScore"\ -filter "MQRankSum < -12.5"\ -filterName "MQRankSum"\ -filter "ReadPosRankSum < -8.0"\ -filterName "ReadPosRankSum"\ -R $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta\ -V snvs_raw_gatk.vcf\ -o snvs_raw_gatk_labeled.vcf # Checking files wc -l snvs_raw_gatk.vcf wc -l snvs_raw_gatk_labeled.vcf grep PASS snvs_raw_gatk_labeled.vcf | wc -l ###### Samtools # Run program and see options $HOME/mda13/calling/software/samtools/samtools # UnifiedGenotyper $HOME/mda13/calling/software/samtools/samtools mpileup # Variant calling $HOME/mda13/calling/software/samtools/samtools mpileup\ -u -C 50 -l $HOME/mda13/calling/resources/Exon_50mb_hg19_chr20.bed\ -f $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta\ $HOME/mda13/calling/mapping/test_final.bam |\ $HOME/mda13/calling/software/samtools/bcftools/bcftools view -bvcg - >\ raw_samtools.bcf # Converting to VCF format of snvs $HOME/mda13/calling/software/samtools/bcftools/bcftools view\ raw_samtools.bcf |\ grep -v INDEL > snvs_raw_samtools.vcf # Filtering variants $HOME/mda13/calling/software/samtools/bcftools/vcfutils.pl varFilter\ snvs_raw_samtools.vcf >\ snvs_raw_samtools_filtered.vcf #Check positions: # Variants detected by GATK but not by Samtools 20:238507 20:590542 20:590543 20:865896 20:1291076 20:1517885 20:1895755 20:2413125 20:2539387 20:3025514 20:3657803 20:3657804 # Variants detected by Samtools but not by GATK 20:397928 20:944649 20:1209155 20:2380332 20:2380396 20:3650205 20:3734514 20:4680208 20:13060543 20:18123556 20:18511456 20:21142813 20:23016692 20:23729722 #################################################################################### ################ Indel Calling #################################################################################### ###### GATK # Executing indels calling of variants java -jar $HOME/mda13/calling/software/gatk/GenomeAnalysisTK.jar\ -T UnifiedGenotyper \ -R $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta\ -L $HOME/mda13/calling/resources/Exon_50mb_hg19_chr20.bed\ -I $HOME/mda13/calling/mapping/test_final.bam\ -glm INDEL\ -o indels_raw_gatk.vcf # Checking file less indels_raw_gatk.vcf # Counting lines du -hs indels_raw_gatk.vcf wc -l indels_raw_gatk.vcf # Labeling indels VCF file java -jar $HOME/mda13/calling/software/gatk/GenomeAnalysisTK.jar\ -T VariantFiltration\ -filter "QD < 2.0"\ -filterName "QD"\ -filter "FS > 200.0"\ -filterName "FS"\ -filter "ReadPosRankSum < -20.0"\ -filterName "ReadPosRankSum"\ -R $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta\ -V indels_raw_gatk.vcf\ -o indels_raw_gatk_labeled.vcf # Checking files wc -l indels_raw_gatk.vcf wc -l indels_raw_gatk_labeled.vcf grep PASS indels_raw_gatk_labeled.vcf | wc -l ############################## Samtools # Indel calling $HOME/mda13/calling/software/samtools/bcftools/bcftools view\ raw_samtools.bcf |\ grep INDEL > indels_raw_samtools.vcf # Filtering indels $HOME/mda13/calling/software/samtools/bcftools/vcfutils.pl varFilter\ indels_raw_samtools.vcf >\ indels_raw_samtools_filtered.vcf #Check positions: # Indels detected by GATK but not by Samtools 20:10622080 20:29632674 20:31292298 20:31384616 20:31427484 20:35695535 20:42249479 20:50008006 20:50015300 20:50713652 20:50768672 20:57828914 20:60294246 <---- homopolymer 20:61167632 <---- homopolymer # Indels detected by Samtools but not by GATK 20:944691 20:3052951 20:29449623 20:29623248 20:36149747 20:40074338 20:57828924 20:60737994 ############################# Dindel mkdir dindel_output # Stage 1 $HOME/mda13/calling/software/dindel/dindel --analysis getCIGARindels\ --bamFile $HOME/mda13/calling/mapping/test_final.bam\ --outputFile dindel_output/test_dindel\ --ref $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta # Stage 2 $HOME/mda13/calling/software/dindel/makeWindows.py\ --inputVarFile dindel_output/test_dindel.variants.txt\ --windowFilePrefix dindel_output/test_realign_windows\ --numWindowsPerFile 1000 # Stage 3 for varfile in dindel_output/*_realign_windows.*.txt do number=`basename $varfile .txt | sed 's/^.*_realign_windows.//'` $HOME/mda13/calling/software/dindel/dindel --analysis indels --doDiploid\ --bamFile $HOME/mda13/calling/mapping/test_final.bam\ --ref $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta\ --varFile $varfile\ --libFile dindel_output/test_dindel.libraries.txt\ --outputFile dindel_output/test_dindel_stage2_output_windows.$number done # Stage 4 ----------- Editing third line of mergeOutputDiploid.py script with utils path # Set path to $HOME/mda13/calling/software/dindel/utils ls dindel_output/*.glf.txt > dindel_output/test_glfFiles.txt $HOME/mda13/calling/software/dindel/mergeOutputDiploid.py\ -i dindel_output/test_glfFiles.txt\ -o indels_raw_dindel.vcf\ -s test\ -r $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta # Stage 5 # Select PASS indels grep PASS indels_raw_dindel.vcf > indels_raw_dindel_filtered.vcf # Indels detected only by dindel 20:271135 20:334317 20:865664 20:2291164 20:2411731 20:2797764 20:3212222 20:3674821 <----- homopolymer 20:3838205 20:3928844 <----- homopolymer 20:4768475 20:4778799 20:5086720 <----- homopolymer 20:5556686 20:5948169 20:5948421 20:29633966 <----- homopolymer 20:29647107 20:30354257 20:31291919 20:31292184