Snps_pipeline

#Zimpel et al., 2020. "Global distribution and evolution of Mycobacterium bovis lineages".
#Cristina Kraemer Zimpel and Robson Francisco de Souza

#Trimmomatic
#input: fastq files
for i in `ls -1 *1*.fastq.gz | sed 's/\_1.fastq.gz//'`; do trimmomatic PE -phred33 $i\_1.fastq.gz $i\_2.fastq.gz $i\_1_paired.fq.gz $i\_1_unpaired.fq.gz $i\_2_paired.fq.gz $i\_2_unpaired.fq.gz ILLUMINACLIP:NexteraPE-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:5:20 MINLEN:36; done

#FASTQC
fastqc *paired.fq.gz -o output_folder

#BWA mem; 
#reference genome AF2122.fasta
#index construction
bwa index reference_genome.fasta

#map to sorted bam
\ls *.fq.gz | cut -f 1 -d _ | sort -u | parallel -N1 -j5 'bwa mem -t 4 AF2122.fasta {}_1_paired.fq.gz {}_2_paired.fq.gz | samtools sort -o {}.bam; pileup.sh in={}.bam > coverage/{}.pu.out 2> coverage/{}.pu.err.txt; samtools coverage {}.bam > coverage/{}.st.out 2> coverage/{}.st.err'

#check depth in RDs positions (developed by Robson Francisco de Souza):
\ls *.bam | parallel --citation -N1 -j5 "samtools depth {} | awk '\$2 >= #POSITION1NITIAL# && \$2 <= #POSITION2FINAL#{a+=\$3;n++}END{print \"{.}\t\",a/n}'"

#Parallel citation:
#O. Tange (2018): GNU Parallel 2018, Mar 2018, ISBN 9781387509881, DOI https://doi.org/10.5281/zenodo.11460


#Picard
for i in `ls *.bam`:
    do picard MarkDuplicates REMOVE_DUPLICATES=true INPUT=$i OUTPUT=$i.dupl.bam METRICS_FILE=$i.txt
done

#check number of reads
#samtools view input.bam.dupl.bam | wc -l

#Samtools mpileup
#-q = min mapping quality; -Q = min base quality
for i in $(ls *.dupl.bam);do samtools mpileup -f AF2122.fasta -q 20 -Q 20 "$i" > "$i".mpileup ; done

#VarScan mpileup2cns
for i in `ls *.mpileup`:; do java -jar varscan mpileup2cns  ${i} --min-coverage 7 --min-var-freq 0.1 --min-freq-for-hom 0.90 --variants --output-vcf 1 > ${i}.varscan.vcf ; done

#options:
java -jar snpEff.jar databases | grep "your search term for your organism"
#Snps annotation - snpEff
#check Chromosome name
awk '{print$1}' myfile.vcf | tail -n 1
#NC_002945.4

#replace Chromosome name
for f in $(ls *.vcf)
do
    sed -i 's/NC_002945.4/Chromosome/g' "$f" > "$i".vcf
        echo "Processing $f"
done


#snpEff
for f in $(ls *.vcf):; do java -Xmx4g -jar ~/snpEff/snpEff.jar -v -no-downstream -no-upstream Mycobacterium_bovis_af2122_97 "${f}" > "${f}".annot.vcf ; done

##################

#remove genes
for i in *.vcf; do awk -F'\t' '! ($8 ~ /13E12|PE_PGRS|Transposase|Integrase|integrase|transposase|PPE|Resolvase|resolvase|Phage|phage|Maturase|PE family|PE|pe1|pe2|p3|pe4|pe5|pe6|pe7|pe8|pe9|CONSERVED 13E12/) {print}' $i > $i.vcf ; done

#Remove lines with INDELS in the columns 4 and 5
for i in *.vcf.vcf; do awk 'length($4)<2 {print}' $i > $i.vcf; done

for i in *.vcf.vcf.vcf; do awk 'length($5)<2 {print}' $i > $i.txt; done

#SNPs - homogeneous
for i in *.vcf.vcf.vcf.txt; do awk -F'\t' '! ($8 ~ /HET=1/) {print}' $i > $i.txt; done


#Commands below are to prepare the files to run the snp matrix pipeline in python
#add new column (1st column, filled with 1)

for i in *.txt; do awk '{$0="1\t"$0}1' $i > $i.txt; done

#remmove column 4
for i in *.txt; do awk -i inplace '{$0=gensub(/\s*\S+/,"",4)}1' $i

#Remove vcf header
for f in *.txt ; do sed -i '' '/#/d' $f ; done