gene_x 0 like s 24 view s
Tags: pipeline
Prepare input raw data
~/DATA/Data_DAMIAN_Post-processing_Flavivirus_and_FSME
ln ./240621_M03701_0312_000000000-GHL9N/p20534/7448_7501_S0_R1_001.fastq.gz p20534_7448_R1.fastq.gz
ln ./240621_M03701_0312_000000000-GHL9N/p20534/7448_7501_S0_R2_001.fastq.gz p20534_7448_R2.fastq.gz
Prepare virus database and select 8 representatives for the eight given viruses from the database
# -- Download genomes --
# ---- Date is 13.06.2025. ----
#Taxonomy ID: 3044782
#Die Gattung Orthoflavivirus (früher Flavivirus) umfasst behüllte Viren mit einem positivsträngigen RNA-Einzelstrang als Genom, die durch Arthropoden (Zecken und Stechmücken) als Vektoren auf Vögel und Säugetiere übertragen werden.
#The English name for Flavivirus is simply: Flavivirus
#It is both the scientific and common name for the genus of viruses in the family Flaviviridae. This genus includes several well-known viruses such as:
* Dengue virus
* Zika virus
* West Nile virus
* Yellow fever virus
* Tick-borne encephalitis virus (TBEV / FSME virus)
esearch -db nucleotide -query "txid3044782[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > genome_3044782_ncbi.fasta
python ~/Scripts/filter_fasta.py genome_3044782_ncbi.fasta complete_genome_3044782_ncbi.fasta #96579-->9431
#https://www.ebi.ac.uk/ena/browser/view/3044782
#Download FMSE
esearch -db nucleotide -query "txid11084[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > genome_11084_ncbi.fasta
python ~/Scripts/filter_fasta.py genome_11084_ncbi.fasta complete_genome_11084_ncbi.fasta #3426-->219
#https://www.ebi.ac.uk/ena/browser/view/11084
samtools faidx complete_genome_11084_ncbi.fasta PV626569.1 > PV626569.fasta
Run the second round of vrap (--host==${virus}.fasta)
#cat FluB_PB1.fasta FluB_PB2.fasta FluB_PA.fasta FluB_HA.fasta FluB_NP.fasta FluB_NB_NA.fasta FluB_M1_BM2.fasta FluB_NEP_NS1.fasta > FluB.fasta
# Run vrap (second round): selecte some representative viruses from the generated Excel-files generated by the last step as --host
(vrap) for sample in p20534_7448; do
vrap/vrap_until_bowtie2.py -1 ${sample}_R1.fastq.gz -2 ${sample}_R2.fastq.gz -o vrap_${sample}_on_FSME --host /home/jhuang/DATA/Data_DAMIAN_Post-processing_Flavivirus_and_FSME/PV626569.fasta -t 100 -l 200 --gbt2 --noblast
done
(vrap) for sample in p20534_7448; do
vrap/vrap_until_bowtie2.py -1 ${sample}_R1.fastq.gz -2 ${sample}_R2.fastq.gz -o vrap_${sample}_on_Flavivirus --host /home/jhuang/DATA/Data_DAMIAN_Post-processing_Flavivirus_and_FSME/complete_genome_3044782_ncbi.fasta -t 100 -l 200 --gbt2 --noblast
done
Generate the mapping statistics for the sam-files generated from last step
for sample in p20534_7448; do
echo "-----${sample}_on_representatives------" >> LOG_mapping
#cd vrap_${sample}_on_${virus}/bowtie
cd vrap_${sample}_on_Flavivirus/bowtie
# Rename and convert SAM to BAM
mv mapped mapped.sam 2>> ../../LOG_mapping
samtools view -S -b mapped.sam > mapped.bam 2>> ../../LOG_mapping
samtools sort mapped.bam -o mapped_sorted.bam 2>> ../../LOG_mapping
samtools index mapped_sorted.bam 2>> ../../LOG_mapping
# Write flagstat output to log (go up two levels to write correctly)
samtools flagstat mapped_sorted.bam >> ../../LOG_mapping 2>&1
#samtools idxstats mapped_sorted.bam >> ../../LOG_mapping 2>&1
cd ../..
done
(bakta) jhuang@WS-2290C:/mnt/md1/DATA/Data_DAMIAN_Post-processing_Flavivirus_and_FSME/vrap_p20534_7448_on_FSME/bowtie$ samtools flagstat mapped_sorted.bam
7836046 + 0 in total (QC-passed reads + QC-failed reads)
7836046 + 0 primary
0 + 0 secondary
0 + 0 supplementary
0 + 0 duplicates
0 + 0 primary duplicates
0 + 0 mapped (0.00% : N/A)
0 + 0 primary mapped (0.00% : N/A)
5539082 + 0 paired in sequencing
2769541 + 0 read1
2769541 + 0 read2
0 + 0 properly paired (0.00% : N/A)
0 + 0 with itself and mate mapped
0 + 0 singletons (0.00% : N/A)
0 + 0 with mate mapped to a different chr
0 + 0 with mate mapped to a different chr (mapQ>=5)
(bakta) jhuang@WS-2290C:/mnt/md1/DATA/Data_DAMIAN_Post-processing_Flavivirus_and_FSME/vrap_p20534_7448_on_Flavivirus/bowtie$ samtools flagstat mapped_sorted.bam
7836234 + 0 in total (QC-passed reads + QC-failed reads)
7836234 + 0 primary
0 + 0 secondary
0 + 0 supplementary
0 + 0 duplicates
0 + 0 primary duplicates
52 + 0 mapped (0.00% : N/A)
52 + 0 primary mapped (0.00% : N/A)
5539458 + 0 paired in sequencing
2769729 + 0 read1
2769729 + 0 read2
0 + 0 properly paired (0.00% : N/A)
4 + 0 with itself and mate mapped
13 + 0 singletons (0.00% : N/A)
0 + 0 with mate mapped to a different chr
0 + 0 with mate mapped to a different chr (mapQ>=5)
samtools view -F 4 mapped_sorted.bam > mapped_reads.sam
awk '{print $3}' mapped_reads.sam | sort | uniq -c
52 KY766069.1 Zika virus isolate Pf13/251013-18, complete genome
# ------------------ DEBUG ----------------------
samtools idxstats mapped_sorted.bam | cut -f 1
for ref in PV424649.1 PV424650.1 PV424648.1 PV424643.1 PV424646.1 PV424645.1 PV424644.1 PV424647.1; do
echo "Reference: $ref"
samtools view -b mapped_sorted.bam "$ref" | samtools flagstat -
done
When I run samtools flagstat mapped_sorted.bam
49572521 + 0 in total (QC-passed reads + QC-failed reads)
0 + 0 secondary
0 + 0 supplementary
0 + 0 duplicates
1169 + 0 mapped (0.00% : N/A)
38247374 + 0 paired in sequencing
19123687 + 0 read1
19123687 + 0 read2
884 + 0 properly paired (0.00% : N/A)
934 + 0 with itself and mate mapped
227 + 0 singletons (0.00% : N/A)
0 + 0 with mate mapped to a different chr
0 + 0 with mate mapped to a different chr (mapQ>=5)
however, wenn I run for ref in PV424649.1 PV424650.1 PV424648.1 PV424643.1 PV424646.1 PV424645.1 PV424644.1 PV424647.1; do
echo "Reference: $ref"
samtools view -b mapped_sorted.bam "$ref" | samtools flagstat -
done
Reference: PV424647.1
83 + 0 in total (QC-passed reads + QC-failed reads)
0 + 0 secondary
0 + 0 supplementary
0 + 0 duplicates
72 + 0 mapped (86.75% : N/A)
82 + 0 paired in sequencing
41 + 0 read1
41 + 0 read2
56 + 0 properly paired (68.29% : N/A)
60 + 0 with itself and mate mapped
11 + 0 singletons (13.41% : N/A)
0 + 0 with mate mapped to a different chr
0 + 0 with mate mapped to a different chr (mapQ>=5)
I want to also the same total name as "samtools flagstat mapped_sorted.bam". How?
samtools view -b mapped_sorted.bam PV424649.1
for ref in PV424649.1 PV424650.1 PV424648.1 PV424643.1 PV424646.1 PV424645.1 PV424644.1 PV424647.1; do
echo "Reference: $ref"
samtools view -h mapped_sorted.bam | grep -E "^@|$ref" | samtools view -Sb - | samtools flagstat -
done
# ---- DEBUG END ----
#draw some plots for some representative isolates which found in the first round (see Excel-file).
samtools depth -m 0 -a mapped_sorted.bam > coverage.txt
#grep "PV424649.1" coverage.txt > FluB_PB1_coverage.txt
#grep "PV424650.1" coverage.txt > FluB_PB2_coverage.txt
#grep "PV424648.1" coverage.txt > FluB_PA_coverage.txt
#grep "PV424643.1" coverage.txt > FluB_HA_coverage.txt
#grep "PV424646.1" coverage.txt > FluB_NP_coverage.txt
#grep "PV424645.1" coverage.txt > FluB_NB_NA_coverage.txt
#grep "PV424644.1" coverage.txt > FluB_M1_BM2_coverage.txt
#grep "PV424647.1" coverage.txt > FluB_NEP_NS1_coverage.txt
import pandas as pd
import matplotlib.pyplot as plt
# Load coverage data
df = pd.read_csv("coverage.txt", sep="\t", header=None, names=["chr", "pos", "coverage"])
# Plot
plt.figure(figsize=(10,4))
plt.plot(df["pos"], df["coverage"], color="blue", linewidth=0.5)
plt.xlabel("Genomic Position")
plt.ylabel("Coverage Depth")
plt.title("BAM Coverage Plot")
plt.show()
Report
Subject: Mapping Results for FluB Representative Isolate
I have re-analyzed sample P20534 (7448) with a focus on Flaviviruses and FSME.
Using curated reference sets from NCBI (Taxonomy ID 3044782 for Flavivirus, comprising 9,431 complete genomes—see attached flavivirus_names.txt for details; and Taxonomy ID 11084 for FSME, with 219 complete genomes), I performed targeted mapping. The key findings are summarized below:
* Total reads: 7,836,234
* Mapped to Flavivirus: 52 reads
All 52 reads mapped specifically to Zika virus (KY766069.1, complete genome of isolate Pf13/251013-18)
* Mapped to FSME: No significant hits detected
Please find attached a coverage plot for the Zika virus genome (KY766069).
Preparing a database containing all representative viruses from NCBI Virus #https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/ #Download All Records (18,708) am 26.05.2025
# ------------ Manually update the internal viral databases --------------
##https://www.ebi.ac.uk/ena/browser/view/10239
#esearch -db nucleotide -query "txid10239[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > genome_10239_ncbi.fasta
#esearch -db protein -query "txid11520[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > protein_11520_ncbi.fasta
#mv ~/Tools/vrap/database/viral_db/nucleotide.fa ~/Tools/vrap/database/viral_db/nucleotide_Human_alphaherpesvirus_1.fa
#mv ~/Tools/vrap/database/viral_db/protein.fa ~/Tools/vrap/database/viral_db/protein_Human_alphaherpesvirus_1.fa
#cp genome_11520_ncbi.fasta ~/Tools/vrap/database/viral_db/nucleotide.fa
#cp protein_11520_ncbi.fasta ~/Tools/vrap/database/viral_db/protein.fa
#cd ~/Tools/vrap/database/viral_db
#~/Tools/vrap/external_tools/blast/makeblastdb -in nucleotide.fa -dbtype nucl -parse_seqids -out virus_nucleotide
#~/Tools/vrap/external_tools/blast/makeblastdb -in protein.fa -dbtype prot -parse_seqids -out virus_protein
#vrap/vrap_noassembly.py -1 AW005486_R1.fastq.gz -2 AW005486_R2.fastq.gz -o vrap_AW005486_on_InfluB --bt2idx=/home/jhuang/REFs/genome --host=/home/jhuang/REFs/genome.fa --virus=/mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/complete_11520_ncbi.fasta -t 20 -l 200 -g
# ----------- Three databases ----------
#db is [virus_user_db]
/home/jhuang/Tools/vrap/external_tools/blast/makeblastdb -in /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/blast/custom_viral_seq.fa -dbtype nucl -parse_seqids -out /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/blast/db/virus >> /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/vrap.log 2>> /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/vrap.log
#db is ~/Tools/vrap/database/viral_db/nucleotide.fa [Human alphaherpesvirus 1] [virus_nt_db]
/home/jhuang/Tools/vrap/external_tools/blast/blastn -num_threads 20 -query /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/vrap_contig.fasta -db "/home/jhuang/Tools/vrap/database/viral_db/viral_nucleotide /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/blast/db/virus" -evalue 1e-4 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/blast/blastn.csv >> /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/vrap.log
Warning: [blastn] Examining 5 or more matches is recommended
#db is ~/Tools/vrap/database/viral_db/protein.fa [Human alphaherpesvirus 1] [virus_aa_db]
/home/jhuang/Tools/vrap/external_tools/blast/blastx -num_threads 20 -query /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/blast/blastn.fa -db "/home/jhuang/Tools/vrap/database/viral_db/viral_protein" -evalue 1e-6 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/blast/blastx.csv >> /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/vrap.log
点赞本文的读者
还没有人对此文章表态
没有评论
How to correlate RNA-seq Data with Mass Spectrometry Proteomics Data?
Workflow using QIIME2 for Data_Marius_16S_2025
Variant calling for Data_Pietschmann_229ECoronavirus_Mutations_2025 (via docker own_viral_ngs)
© 2023 XGenes.com Impressum