-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathSnps_pipeline
84 lines (57 loc) · 3.17 KB
/
Snps_pipeline
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#Zimpel et al., 2020. "Global distribution and evolution of Mycobacterium bovis lineages".
#Cristina Kraemer Zimpel and Robson Francisco de Souza
#Trimmomatic
#input: fastq files
for i in `ls -1 *1*.fastq.gz | sed 's/\_1.fastq.gz//'`; do trimmomatic PE -phred33 $i\_1.fastq.gz $i\_2.fastq.gz $i\_1_paired.fq.gz $i\_1_unpaired.fq.gz $i\_2_paired.fq.gz $i\_2_unpaired.fq.gz ILLUMINACLIP:NexteraPE-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:5:20 MINLEN:36; done
#FASTQC
fastqc *paired.fq.gz -o output_folder
#BWA mem;
#reference genome AF2122.fasta
#index construction
bwa index reference_genome.fasta
#map to sorted bam
\ls *.fq.gz | cut -f 1 -d _ | sort -u | parallel -N1 -j5 'bwa mem -t 4 AF2122.fasta {}_1_paired.fq.gz {}_2_paired.fq.gz | samtools sort -o {}.bam; pileup.sh in={}.bam > coverage/{}.pu.out 2> coverage/{}.pu.err.txt; samtools coverage {}.bam > coverage/{}.st.out 2> coverage/{}.st.err'
#check depth in RDs positions (developed by Robson Francisco de Souza):
\ls *.bam | parallel --citation -N1 -j5 "samtools depth {} | awk '\$2 >= #POSITION1NITIAL# && \$2 <= #POSITION2FINAL#{a+=\$3;n++}END{print \"{.}\t\",a/n}'"
#Parallel citation:
#O. Tange (2018): GNU Parallel 2018, Mar 2018, ISBN 9781387509881, DOI https://doi.org/10.5281/zenodo.11460
#Picard
for i in `ls *.bam`:
do picard MarkDuplicates REMOVE_DUPLICATES=true INPUT=$i OUTPUT=$i.dupl.bam METRICS_FILE=$i.txt
done
#check number of reads
#samtools view input.bam.dupl.bam | wc -l
#Samtools mpileup
#-q = min mapping quality; -Q = min base quality
for i in $(ls *.dupl.bam);do samtools mpileup -f AF2122.fasta -q 20 -Q 20 "$i" > "$i".mpileup ; done
#VarScan mpileup2cns
for i in `ls *.mpileup`:; do java -jar varscan mpileup2cns ${i} --min-coverage 7 --min-var-freq 0.1 --min-freq-for-hom 0.90 --variants --output-vcf 1 > ${i}.varscan.vcf ; done
#options:
java -jar snpEff.jar databases | grep "your search term for your organism"
#Snps annotation - snpEff
#check Chromosome name
awk '{print$1}' myfile.vcf | tail -n 1
#NC_002945.4
#replace Chromosome name
for f in $(ls *.vcf)
do
sed -i 's/NC_002945.4/Chromosome/g' "$f" > "$i".vcf
echo "Processing $f"
done
#snpEff
for f in $(ls *.vcf):; do java -Xmx4g -jar ~/snpEff/snpEff.jar -v -no-downstream -no-upstream Mycobacterium_bovis_af2122_97 "${f}" > "${f}".annot.vcf ; done
##################
#remove genes
for i in *.vcf; do awk -F'\t' '! ($8 ~ /13E12|PE_PGRS|Transposase|Integrase|integrase|transposase|PPE|Resolvase|resolvase|Phage|phage|Maturase|PE family|PE|pe1|pe2|p3|pe4|pe5|pe6|pe7|pe8|pe9|CONSERVED 13E12/) {print}' $i > $i.vcf ; done
#Remove lines with INDELS in the columns 4 and 5
for i in *.vcf.vcf; do awk 'length($4)<2 {print}' $i > $i.vcf; done
for i in *.vcf.vcf.vcf; do awk 'length($5)<2 {print}' $i > $i.txt; done
#SNPs - homogeneous
for i in *.vcf.vcf.vcf.txt; do awk -F'\t' '! ($8 ~ /HET=1/) {print}' $i > $i.txt; done
#Commands below are to prepare the files to run the snp matrix pipeline in python
#add new column (1st column, filled with 1)
for i in *.txt; do awk '{$0="1\t"$0}1' $i > $i.txt; done
#remmove column 4
for i in *.txt; do awk -i inplace '{$0=gensub(/\s*\S+/,"",4)}1' $i
#Remove vcf header
for f in *.txt ; do sed -i '' '/#/d' $f ; done