COURSE OVERVIEW
LOGISTICS
LAST COURSE PHOTOS
This shows you the differences between two versions of the page.
program:variant_handson [2013/03/13 08:23] jjimenez |
program:variant_handson [2017/05/24 14:26] (current) |
||
---|---|---|---|
Line 1: | Line 1: | ||
+ | ====== Variant calling and annotation: Practical session ====== | ||
+ | \\ | ||
+ | |||
+ | <code bash> | ||
+ | #!/bin/bash | ||
+ | |||
+ | # Working directory | ||
+ | |||
+ | cd $HOME/mda13/calling | ||
+ | |||
+ | # Checking the reference | ||
+ | |||
+ | head $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta | ||
+ | |||
+ | head -3000 $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta | tail | ||
+ | |||
+ | #Checking the bed file | ||
+ | |||
+ | head $HOME/mda13/calling/resources/Exon_50mb_hg19_chr20.bed | ||
+ | |||
+ | |||
+ | # IGV | ||
+ | igv & | ||
+ | |||
+ | #################################################################################### | ||
+ | |||
+ | ################ SNV Calling | ||
+ | |||
+ | #################################################################################### | ||
+ | |||
+ | ###### GATK | ||
+ | |||
+ | # Run program and see options | ||
+ | java -jar $HOME/mda13/calling/software/gatk/GenomeAnalysisTK.jar | ||
+ | |||
+ | # UnifiedGenotyper | ||
+ | |||
+ | java -jar $HOME/mda13/calling/software/gatk/GenomeAnalysisTK.jar\ | ||
+ | -T UnifiedGenotyper | ||
+ | |||
+ | |||
+ | # Executing SNVs calling of variants | ||
+ | java -jar $HOME/mda13/calling/software/gatk/GenomeAnalysisTK.jar\ | ||
+ | -T UnifiedGenotyper\ | ||
+ | -R $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta\ | ||
+ | -L $HOME/mda13/calling/resources//Exon_50mb_hg19_chr20.bed\ | ||
+ | -I $HOME/mda13/calling/mapping/test_final.bam\ | ||
+ | -glm SNP \ | ||
+ | -o snvs_raw_gatk.vcf | ||
+ | |||
+ | |||
+ | # Checking file | ||
+ | less snvs_raw_gatk.vcf | ||
+ | |||
+ | # Counting lines | ||
+ | du -hs snvs_raw_gatk.vcf | ||
+ | wc -l snvs_raw_gatk.vcf | ||
+ | |||
+ | |||
+ | # Labeling VCF files | ||
+ | # VariantFiltration | ||
+ | $HOME/mda13/calling/software/gatk/GenomeAnalysisTK.jar \ | ||
+ | -T VariantFiltration | ||
+ | |||
+ | # Labeling SNVs VCF file | ||
+ | java -jar $HOME/mda13/calling/software/gatk/GenomeAnalysisTK.jar\ | ||
+ | -T VariantFiltration\ | ||
+ | -filter "QD < 2.0"\ | ||
+ | -filterName "QD"\ | ||
+ | -filter "MQ < 40.0"\ | ||
+ | -filterName "MQ"\ | ||
+ | -filter "FS > 60.0"\ | ||
+ | -filterName "FS"\ | ||
+ | -filter "HaplotypeScore > 13.0"\ | ||
+ | -filterName "HaplotypeScore"\ | ||
+ | -filter "MQRankSum < -12.5"\ | ||
+ | -filterName "MQRankSum"\ | ||
+ | -filter "ReadPosRankSum < -8.0"\ | ||
+ | -filterName "ReadPosRankSum"\ | ||
+ | -R $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta\ | ||
+ | -V snvs_raw_gatk.vcf\ | ||
+ | -o snvs_raw_gatk_labeled.vcf | ||
+ | |||
+ | # Checking files | ||
+ | wc -l snvs_raw_gatk.vcf | ||
+ | wc -l snvs_raw_gatk_labeled.vcf | ||
+ | grep PASS snvs_raw_gatk_labeled.vcf | wc -l | ||
+ | |||
+ | |||
+ | |||
+ | ###### Samtools | ||
+ | |||
+ | # Run program and see options | ||
+ | $HOME/mda13/calling/software/samtools/samtools | ||
+ | |||
+ | # UnifiedGenotyper | ||
+ | |||
+ | $HOME/mda13/calling/software/samtools/samtools mpileup | ||
+ | |||
+ | # Variant calling | ||
+ | $HOME/mda13/calling/software/samtools/samtools mpileup\ | ||
+ | -u -C 50 -l $HOME/mda13/calling/resources/Exon_50mb_hg19_chr20.bed\ | ||
+ | -f $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta\ | ||
+ | $HOME/mda13/calling/mapping/test_final.bam |\ | ||
+ | $HOME/mda13/calling/software/samtools/bcftools/bcftools view -bvcg - >\ | ||
+ | raw_samtools.bcf | ||
+ | |||
+ | # Converting to VCF format of snvs | ||
+ | $HOME/mda13/calling/software/samtools/bcftools/bcftools view\ | ||
+ | raw_samtools.bcf |\ | ||
+ | grep -v INDEL > snvs_raw_samtools.vcf | ||
+ | |||
+ | # Filtering variants | ||
+ | $HOME/mda13/calling/software/samtools/bcftools/vcfutils.pl varFilter\ | ||
+ | snvs_raw_samtools.vcf >\ | ||
+ | snvs_raw_samtools_filtered.vcf | ||
+ | |||
+ | |||
+ | |||
+ | #Check positions: | ||
+ | # Variants detected by GATK but not by Samtools | ||
+ | 20:238507 | ||
+ | 20:590542 | ||
+ | 20:590543 | ||
+ | 20:865896 | ||
+ | 20:1291076 | ||
+ | 20:1517885 | ||
+ | 20:1895755 | ||
+ | 20:2413125 | ||
+ | 20:2539387 | ||
+ | 20:3025514 | ||
+ | 20:3657803 | ||
+ | 20:3657804 | ||
+ | |||
+ | |||
+ | # Variants detected by Samtools but not by GATK | ||
+ | 20:397928 | ||
+ | 20:944649 | ||
+ | 20:1209155 | ||
+ | 20:2380332 | ||
+ | 20:2380396 | ||
+ | 20:3650205 | ||
+ | 20:3734514 | ||
+ | 20:4680208 | ||
+ | 20:13060543 | ||
+ | 20:18123556 | ||
+ | 20:18511456 | ||
+ | 20:21142813 | ||
+ | 20:23016692 | ||
+ | 20:23729722 | ||
+ | |||
+ | |||
+ | |||
+ | #################################################################################### | ||
+ | |||
+ | ################ Indel Calling | ||
+ | |||
+ | #################################################################################### | ||
+ | |||
+ | ###### GATK | ||
+ | |||
+ | |||
+ | # Executing indels calling of variants | ||
+ | java -jar $HOME/mda13/calling/software/gatk/GenomeAnalysisTK.jar\ | ||
+ | -T UnifiedGenotyper \ | ||
+ | -R $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta\ | ||
+ | -L $HOME/mda13/calling/resources/Exon_50mb_hg19_chr20.bed\ | ||
+ | -I $HOME/mda13/calling/mapping/test_final.bam\ | ||
+ | -glm INDEL\ | ||
+ | -o indels_raw_gatk.vcf | ||
+ | |||
+ | # Checking file | ||
+ | less indels_raw_gatk.vcf | ||
+ | |||
+ | # Counting lines | ||
+ | du -hs indels_raw_gatk.vcf | ||
+ | wc -l indels_raw_gatk.vcf | ||
+ | |||
+ | # Labeling indels VCF file | ||
+ | java -jar $HOME/mda13/calling/software/gatk/GenomeAnalysisTK.jar\ | ||
+ | -T VariantFiltration\ | ||
+ | -filter "QD < 2.0"\ | ||
+ | -filterName "QD"\ | ||
+ | -filter "FS > 200.0"\ | ||
+ | -filterName "FS"\ | ||
+ | -filter "ReadPosRankSum < -20.0"\ | ||
+ | -filterName "ReadPosRankSum"\ | ||
+ | -R $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta\ | ||
+ | -V indels_raw_gatk.vcf\ | ||
+ | -o indels_raw_gatk_labeled.vcf | ||
+ | |||
+ | # Checking files | ||
+ | wc -l indels_raw_gatk.vcf | ||
+ | wc -l indels_raw_gatk_labeled.vcf | ||
+ | grep PASS indels_raw_gatk_labeled.vcf | wc -l | ||
+ | |||
+ | |||
+ | ############################## Samtools | ||
+ | |||
+ | # Indel calling | ||
+ | |||
+ | $HOME/mda13/calling/software/samtools/bcftools/bcftools view\ | ||
+ | raw_samtools.bcf |\ | ||
+ | grep INDEL > indels_raw_samtools.vcf | ||
+ | |||
+ | # Filtering indels | ||
+ | $HOME/mda13/calling/software/samtools/bcftools/vcfutils.pl varFilter\ | ||
+ | indels_raw_samtools.vcf >\ | ||
+ | indels_raw_samtools_filtered.vcf | ||
+ | |||
+ | |||
+ | #Check positions: | ||
+ | # Indels detected by GATK but not by Samtools | ||
+ | 20:10622080 | ||
+ | 20:29632674 | ||
+ | 20:31292298 | ||
+ | 20:31384616 | ||
+ | 20:31427484 | ||
+ | 20:35695535 | ||
+ | 20:42249479 | ||
+ | 20:50008006 | ||
+ | 20:50015300 | ||
+ | 20:50713652 | ||
+ | 20:50768672 | ||
+ | 20:57828914 | ||
+ | 20:60294246 <---- homopolymer | ||
+ | 20:61167632 <---- homopolymer | ||
+ | |||
+ | |||
+ | |||
+ | # Indels detected by Samtools but not by GATK | ||
+ | 20:944691 | ||
+ | 20:3052951 | ||
+ | 20:29449623 | ||
+ | 20:29623248 | ||
+ | 20:36149747 | ||
+ | 20:40074338 | ||
+ | 20:57828924 | ||
+ | 20:60737994 | ||
+ | |||
+ | ############################# Dindel | ||
+ | |||
+ | mkdir dindel_output | ||
+ | |||
+ | # Stage 1 | ||
+ | |||
+ | $HOME/mda13/calling/software/dindel/dindel --analysis getCIGARindels\ | ||
+ | --bamFile $HOME/mda13/calling/mapping/test_final.bam\ | ||
+ | --outputFile dindel_output/test_dindel\ | ||
+ | --ref $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta | ||
+ | |||
+ | # Stage 2 | ||
+ | $HOME/mda13/calling/software/dindel/makeWindows.py\ | ||
+ | --inputVarFile dindel_output/test_dindel.variants.txt\ | ||
+ | --windowFilePrefix dindel_output/test_realign_windows\ | ||
+ | --numWindowsPerFile 1000 | ||
+ | |||
+ | # Stage 3 | ||
+ | for varfile in dindel_output/*_realign_windows.*.txt | ||
+ | do number=`basename $varfile .txt | sed 's/^.*_realign_windows.//'` | ||
+ | |||
+ | $HOME/mda13/calling/software/dindel/dindel --analysis indels --doDiploid\ | ||
+ | --bamFile $HOME/mda13/calling/mapping/test_final.bam\ | ||
+ | --ref $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta\ | ||
+ | --varFile $varfile\ | ||
+ | --libFile dindel_output/test_dindel.libraries.txt\ | ||
+ | --outputFile dindel_output/test_dindel_stage2_output_windows.$number | ||
+ | done | ||
+ | |||
+ | # Stage 4 ----------- Editing third line of mergeOutputDiploid.py script with utils path | ||
+ | # Set path to $HOME/mda13/calling/software/dindel/utils | ||
+ | |||
+ | ls dindel_output/*.glf.txt > dindel_output/test_glfFiles.txt | ||
+ | |||
+ | $HOME/mda13/calling/software/dindel/mergeOutputDiploid.py\ | ||
+ | -i dindel_output/test_glfFiles.txt\ | ||
+ | -o indels_raw_dindel.vcf\ | ||
+ | -s test\ | ||
+ | -r $HOME/mda13/calling/resources/human_g1k_v37.chr20.fasta | ||
+ | |||
+ | |||
+ | # Stage 5 | ||
+ | # Select PASS indels | ||
+ | grep PASS indels_raw_dindel.vcf > indels_raw_dindel_filtered.vcf | ||
+ | |||
+ | # Indels detected only by dindel | ||
+ | 20:271135 | ||
+ | 20:334317 | ||
+ | 20:865664 | ||
+ | 20:2291164 | ||
+ | 20:2411731 | ||
+ | 20:2797764 | ||
+ | 20:3212222 | ||
+ | 20:3674821 <----- homopolymer | ||
+ | 20:3838205 | ||
+ | 20:3928844 <----- homopolymer | ||
+ | 20:4768475 | ||
+ | 20:4778799 | ||
+ | 20:5086720 <----- homopolymer | ||
+ | 20:5556686 | ||
+ | 20:5948169 | ||
+ | 20:5948421 | ||
+ | 20:29633966 <----- homopolymer | ||
+ | 20:29647107 | ||
+ | 20:30354257 | ||
+ | 20:31291919 | ||
+ | 20:31292184 | ||
+ | |||
+ | |||
+ | </code> |