DAMIAN Post-processing for Flavivirus and FSME

gene_x 0 like s 24 view s

Tags: pipeline

Prepare input raw data

~/DATA/Data_DAMIAN_Post-processing_Flavivirus_and_FSME

ln ./240621_M03701_0312_000000000-GHL9N/p20534/7448_7501_S0_R1_001.fastq.gz p20534_7448_R1.fastq.gz
ln ./240621_M03701_0312_000000000-GHL9N/p20534/7448_7501_S0_R2_001.fastq.gz p20534_7448_R2.fastq.gz

Prepare virus database and select 8 representatives for the eight given viruses from the database

# -- Download genomes --
# ---- Date is 13.06.2025. ----
#Taxonomy ID: 3044782
#Die Gattung Orthoflavivirus (früher Flavivirus) umfasst behüllte Viren mit einem positivsträngigen RNA-Einzelstrang als Genom, die durch Arthropoden (Zecken und Stechmücken) als Vektoren auf Vögel und Säugetiere übertragen werden.
#The English name for Flavivirus is simply: Flavivirus
#It is both the scientific and common name for the genus of viruses in the family Flaviviridae. This genus includes several well-known viruses such as:
        * Dengue virus
        * Zika virus
        * West Nile virus
        * Yellow fever virus
        * Tick-borne encephalitis virus (TBEV / FSME virus)

esearch -db nucleotide -query "txid3044782[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > genome_3044782_ncbi.fasta
python ~/Scripts/filter_fasta.py genome_3044782_ncbi.fasta complete_genome_3044782_ncbi.fasta  #96579-->9431
#https://www.ebi.ac.uk/ena/browser/view/3044782

#Download FMSE
esearch -db nucleotide -query "txid11084[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > genome_11084_ncbi.fasta
python ~/Scripts/filter_fasta.py genome_11084_ncbi.fasta complete_genome_11084_ncbi.fasta  #3426-->219
#https://www.ebi.ac.uk/ena/browser/view/11084

samtools faidx complete_genome_11084_ncbi.fasta PV626569.1 > PV626569.fasta

Run the second round of vrap (--host==${virus}.fasta)

#cat FluB_PB1.fasta FluB_PB2.fasta FluB_PA.fasta FluB_HA.fasta FluB_NP.fasta FluB_NB_NA.fasta FluB_M1_BM2.fasta FluB_NEP_NS1.fasta > FluB.fasta

# Run vrap (second round): selecte some representative viruses from the generated Excel-files generated by the last step as --host
(vrap) for sample in p20534_7448; do
    vrap/vrap_until_bowtie2.py  -1 ${sample}_R1.fastq.gz -2 ${sample}_R2.fastq.gz  -o vrap_${sample}_on_FSME --host /home/jhuang/DATA/Data_DAMIAN_Post-processing_Flavivirus_and_FSME/PV626569.fasta   -t 100 -l 200  --gbt2 --noblast
done

(vrap) for sample in p20534_7448; do
    vrap/vrap_until_bowtie2.py  -1 ${sample}_R1.fastq.gz -2 ${sample}_R2.fastq.gz  -o vrap_${sample}_on_Flavivirus --host /home/jhuang/DATA/Data_DAMIAN_Post-processing_Flavivirus_and_FSME/complete_genome_3044782_ncbi.fasta   -t 100 -l 200  --gbt2 --noblast
done

Generate the mapping statistics for the sam-files generated from last step

for sample in p20534_7448; do
    echo "-----${sample}_on_representatives------" >> LOG_mapping
    #cd vrap_${sample}_on_${virus}/bowtie
    cd vrap_${sample}_on_Flavivirus/bowtie
    # Rename and convert SAM to BAM
    mv mapped mapped.sam 2>> ../../LOG_mapping
    samtools view -S -b mapped.sam > mapped.bam 2>> ../../LOG_mapping
    samtools sort mapped.bam -o mapped_sorted.bam 2>> ../../LOG_mapping
    samtools index mapped_sorted.bam 2>> ../../LOG_mapping
    # Write flagstat output to log (go up two levels to write correctly)
    samtools flagstat mapped_sorted.bam >> ../../LOG_mapping 2>&1
    #samtools idxstats mapped_sorted.bam >> ../../LOG_mapping 2>&1
    cd ../..
done

(bakta) jhuang@WS-2290C:/mnt/md1/DATA/Data_DAMIAN_Post-processing_Flavivirus_and_FSME/vrap_p20534_7448_on_FSME/bowtie$ samtools flagstat mapped_sorted.bam
7836046 + 0 in total (QC-passed reads + QC-failed reads)
7836046 + 0 primary
0 + 0 secondary
0 + 0 supplementary
0 + 0 duplicates
0 + 0 primary duplicates
0 + 0 mapped (0.00% : N/A)
0 + 0 primary mapped (0.00% : N/A)
5539082 + 0 paired in sequencing
2769541 + 0 read1
2769541 + 0 read2
0 + 0 properly paired (0.00% : N/A)
0 + 0 with itself and mate mapped
0 + 0 singletons (0.00% : N/A)
0 + 0 with mate mapped to a different chr
0 + 0 with mate mapped to a different chr (mapQ>=5)

(bakta) jhuang@WS-2290C:/mnt/md1/DATA/Data_DAMIAN_Post-processing_Flavivirus_and_FSME/vrap_p20534_7448_on_Flavivirus/bowtie$ samtools flagstat mapped_sorted.bam
7836234 + 0 in total (QC-passed reads + QC-failed reads)
7836234 + 0 primary
0 + 0 secondary
0 + 0 supplementary
0 + 0 duplicates
0 + 0 primary duplicates
52 + 0 mapped (0.00% : N/A)
52 + 0 primary mapped (0.00% : N/A)
5539458 + 0 paired in sequencing
2769729 + 0 read1
2769729 + 0 read2
0 + 0 properly paired (0.00% : N/A)
4 + 0 with itself and mate mapped
13 + 0 singletons (0.00% : N/A)
0 + 0 with mate mapped to a different chr
0 + 0 with mate mapped to a different chr (mapQ>=5)

samtools view -F 4 mapped_sorted.bam > mapped_reads.sam
awk '{print $3}' mapped_reads.sam | sort | uniq -c
52 KY766069.1 Zika virus isolate Pf13/251013-18, complete genome

# ------------------ DEBUG ----------------------
samtools idxstats mapped_sorted.bam | cut -f 1

for ref in PV424649.1 PV424650.1 PV424648.1 PV424643.1 PV424646.1 PV424645.1 PV424644.1 PV424647.1; do
    echo "Reference: $ref"
    samtools view -b mapped_sorted.bam "$ref" | samtools flagstat -
done

When I run samtools flagstat mapped_sorted.bam

49572521 + 0 in total (QC-passed reads + QC-failed reads)
0 + 0 secondary
0 + 0 supplementary
0 + 0 duplicates
1169 + 0 mapped (0.00% : N/A)
38247374 + 0 paired in sequencing
19123687 + 0 read1
19123687 + 0 read2
884 + 0 properly paired (0.00% : N/A)
934 + 0 with itself and mate mapped
227 + 0 singletons (0.00% : N/A)
0 + 0 with mate mapped to a different chr
0 + 0 with mate mapped to a different chr (mapQ>=5)

however, wenn I run for ref in PV424649.1 PV424650.1 PV424648.1 PV424643.1 PV424646.1 PV424645.1 PV424644.1 PV424647.1; do
        echo "Reference: $ref"
        samtools view -b mapped_sorted.bam "$ref" | samtools flagstat -
        done

Reference: PV424647.1
83 + 0 in total (QC-passed reads + QC-failed reads)
0 + 0 secondary
0 + 0 supplementary
0 + 0 duplicates
72 + 0 mapped (86.75% : N/A)
82 + 0 paired in sequencing
41 + 0 read1
41 + 0 read2
56 + 0 properly paired (68.29% : N/A)
60 + 0 with itself and mate mapped
11 + 0 singletons (13.41% : N/A)
0 + 0 with mate mapped to a different chr
0 + 0 with mate mapped to a different chr (mapQ>=5)

I want to also the same total name as "samtools flagstat mapped_sorted.bam". How?

samtools view -b mapped_sorted.bam PV424649.1

for ref in PV424649.1 PV424650.1 PV424648.1 PV424643.1 PV424646.1 PV424645.1 PV424644.1 PV424647.1; do
echo "Reference: $ref"
samtools view -h mapped_sorted.bam | grep -E "^@|$ref" | samtools view -Sb - | samtools flagstat -
done

# ---- DEBUG END ----

#draw some plots for some representative isolates which found in the first round (see Excel-file).
samtools depth -m 0 -a mapped_sorted.bam > coverage.txt
#grep "PV424649.1" coverage.txt > FluB_PB1_coverage.txt
#grep "PV424650.1" coverage.txt > FluB_PB2_coverage.txt
#grep "PV424648.1" coverage.txt > FluB_PA_coverage.txt
#grep "PV424643.1" coverage.txt > FluB_HA_coverage.txt
#grep "PV424646.1" coverage.txt > FluB_NP_coverage.txt
#grep "PV424645.1" coverage.txt > FluB_NB_NA_coverage.txt
#grep "PV424644.1" coverage.txt > FluB_M1_BM2_coverage.txt
#grep "PV424647.1" coverage.txt > FluB_NEP_NS1_coverage.txt

        import pandas as pd
        import matplotlib.pyplot as plt

        # Load coverage data
        df = pd.read_csv("coverage.txt", sep="\t", header=None, names=["chr", "pos", "coverage"])

        # Plot
        plt.figure(figsize=(10,4))
        plt.plot(df["pos"], df["coverage"], color="blue", linewidth=0.5)
        plt.xlabel("Genomic Position")
        plt.ylabel("Coverage Depth")
        plt.title("BAM Coverage Plot")
        plt.show()

Report

Subject: Mapping Results for FluB Representative Isolate

I have re-analyzed sample P20534 (7448) with a focus on Flaviviruses and FSME.

Using curated reference sets from NCBI (Taxonomy ID 3044782 for Flavivirus, comprising 9,431 complete genomes—see attached flavivirus_names.txt for details; and Taxonomy ID 11084 for FSME, with 219 complete genomes), I performed targeted mapping. The key findings are summarized below:

    * Total reads: 7,836,234
    * Mapped to Flavivirus: 52 reads
      All 52 reads mapped specifically to Zika virus (KY766069.1, complete genome of isolate Pf13/251013-18)
    * Mapped to FSME: No significant hits detected

Please find attached a coverage plot for the Zika virus genome (KY766069).

Preparing a database containing all representative viruses from NCBI Virus #https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/ #Download All Records (18,708) am 26.05.2025

# ------------ Manually update the internal viral databases --------------
##https://www.ebi.ac.uk/ena/browser/view/10239
#esearch -db nucleotide -query "txid10239[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > genome_10239_ncbi.fasta
#esearch -db protein -query "txid11520[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > protein_11520_ncbi.fasta
#mv ~/Tools/vrap/database/viral_db/nucleotide.fa ~/Tools/vrap/database/viral_db/nucleotide_Human_alphaherpesvirus_1.fa
#mv ~/Tools/vrap/database/viral_db/protein.fa ~/Tools/vrap/database/viral_db/protein_Human_alphaherpesvirus_1.fa
#cp genome_11520_ncbi.fasta ~/Tools/vrap/database/viral_db/nucleotide.fa
#cp protein_11520_ncbi.fasta ~/Tools/vrap/database/viral_db/protein.fa
#cd ~/Tools/vrap/database/viral_db
#~/Tools/vrap/external_tools/blast/makeblastdb -in nucleotide.fa -dbtype nucl -parse_seqids -out virus_nucleotide
#~/Tools/vrap/external_tools/blast/makeblastdb -in protein.fa -dbtype prot -parse_seqids -out virus_protein
#vrap/vrap_noassembly.py  -1 AW005486_R1.fastq.gz -2 AW005486_R2.fastq.gz -o vrap_AW005486_on_InfluB  --bt2idx=/home/jhuang/REFs/genome  --host=/home/jhuang/REFs/genome.fa --virus=/mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/complete_11520_ncbi.fasta  -t 20 -l 200  -g

# ----------- Three databases ----------
#db is [virus_user_db]
/home/jhuang/Tools/vrap/external_tools/blast/makeblastdb -in /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/blast/custom_viral_seq.fa -dbtype nucl -parse_seqids -out /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/blast/db/virus >> /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/vrap.log 2>> /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/vrap.log

#db is ~/Tools/vrap/database/viral_db/nucleotide.fa  [Human alphaherpesvirus 1] [virus_nt_db]
/home/jhuang/Tools/vrap/external_tools/blast/blastn -num_threads 20 -query /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/vrap_contig.fasta -db "/home/jhuang/Tools/vrap/database/viral_db/viral_nucleotide /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/blast/db/virus"  -evalue 1e-4 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/blast/blastn.csv >> /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/vrap.log
Warning: [blastn] Examining 5 or more matches is recommended

#db is ~/Tools/vrap/database/viral_db/protein.fa [Human alphaherpesvirus 1] [virus_aa_db]
/home/jhuang/Tools/vrap/external_tools/blast/blastx -num_threads 20 -query /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/blast/blastn.fa -db "/home/jhuang/Tools/vrap/database/viral_db/viral_protein"  -evalue 1e-6 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/blast/blastx.csv >> /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/vrap.log

like unlike

点赞本文的读者

还没有人对此文章表态

本文有评论

没有评论

DAMIAN Post-processing for Flavivirus and FSME

本文有评论

看文章，发评论，不要沉默

最受欢迎文章

最新文章

最多评论文章

推荐相似文章