Analysis of the RNA binding protein motifs for RNA Seq

gene_x 0 like s 152 view s

Tags: pipeline

Analysis of the RNA binding protein motifs for RNA Seq

TODOs_NEXTWEEK: how to use RBPmap results to enrich the significant RBPs for RNA-seq (3' UTRs?) and miRNAs?

  1. 🧬 1. RBPmap (Web + Command-line)
  2. Type: Web tool and downloadable Perl scripts
  3. Purpose: Scans RNA sequences for known RBP motifs
  4. Database: Uses curated RBP motif sets (e.g., from literature, CLIP data)
  5. URL: https://rbpmap.technion.ac.il/
  6. Good for: Biologists without strong R background
  7. 🚫 Not a package, but scriptable via CLI or HTML results
  1. filtering out RBP from the database

  2. improve the output of the 3utr_down_cleaned.fasta

  3. multiple testing

  4. perform this for miRNA.

http://xgenes.com/article/article-content/146/peak-and-motif-analyses-in-promoters/

http://xgenes.com/article/article-content/154/density-of-motif-plots-and-its-statistical-tests/

There are several alternative R packages and tools to perform motif enrichment analysis for RNA-binding proteins (RBPs), beyond PWMEnrich::motifEnrichment(). Here are the most notable ones:

| Tool / Package | Enrichment | Custom Motifs | CLI or R? | RNA-specific? |

| ------------------------ | ----------------- | --------------- | --------- | -------------- |

| PWMEnrich | ✅ | ✅ | R | ✅ |

| RBPmap | ✅ | ❌ (uses own db) | Web/CLI | ✅ | ----> try RBPmap_results + enrichments!

| Biostrings/TFBSTools | ❌ (only scanning) | ✅ | R | ❌ | #ATtRACT + Biostrings / TFBSTools

| rmap | ✅ (CLIP-based) | ❌ | R | ✅ |

| Homer | ✅ | ✅ | CLI | ⚠ RNA optional |

| MEME (AME, FIMO) | ✅ | ✅ | Web/CLI | ⚠ Generic |

  1. Using R to get 3UTR_sequences.fasta, 5UTR

    1. setwd("~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/motif_analysis")
    2. # === STEP 0: Load libraries ===
    3. if (!requireNamespace("biomaRt", quietly = TRUE)) install.packages("biomaRt")
    4. library(biomaRt)
    5. # === STEP 1: Load DEG files ===
    6. ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-up.txt #20086
    7. ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-down.txt #634
    8. ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-up.txt #23832
    9. ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-down.txt #375
    10. up_file <- "~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-up.txt"
    11. down_file <- "~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-down.txt"
    12. up_degs <- read.table(up_file, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
    13. down_degs <- read.table(down_file, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
    14. # === STEP 2: Clean & extract Ensembl gene IDs ===
    15. up_ids <- unique(na.omit(up_degs[[1]]))
    16. down_ids <- unique(na.omit(down_degs[[1]]))
    17. # Optional: protein-coding filter
    18. # up_ids <- unique(up_degs$external_gene_name[up_degs$gene_biotype == "protein_coding"])
    19. # down_ids <- unique(down_degs$external_gene_name[down_degs$gene_biotype == "protein_coding"])
    20. # === STEP 3: Connect to Ensembl Asia mirror ===
    21. ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl", host = "https://asia.ensembl.org")
    22. # === STEP 4: Robust, minimal-loss sequence fetch ===
    23. get_sequences <- function(ids, seqType, batch_size = 300, retry_depth = 3) {
    24. all_seqs <- list()
    25. failed_ids <- c()
    26. safe_batch_query <- function(batch_ids, depth = 1) {
    27. if (length(batch_ids) == 0) return(NULL)
    28. tryCatch({
    29. seqs <- getSequence(id = batch_ids,
    30. type = "ensembl_gene_id",
    31. seqType = seqType,
    32. mart = ensembl)
    33. seqs <- seqs[seqs[[seqType]] != "", ]
    34. return(seqs)
    35. }, error = function(e) {
    36. if (depth < retry_depth && length(batch_ids) > 1) {
    37. # Split batch and retry each half
    38. split_batches <- split(batch_ids, ceiling(seq_along(batch_ids)/ceiling(length(batch_ids)/2)))
    39. result <- lapply(split_batches, safe_batch_query, depth = depth + 1)
    40. return(do.call(rbind, result))
    41. } else {
    42. message("❌ Final failure for ", length(batch_ids), " genes. Logging IDs.")
    43. failed_ids <<- c(failed_ids, batch_ids)
    44. return(NULL)
    45. }
    46. })
    47. }
    48. id_batches <- split(ids, ceiling(seq_along(ids) / batch_size))
    49. for (i in seq_along(id_batches)) {
    50. batch <- id_batches[[i]]
    51. cat(sprintf(" 🔹 Batch %d/%d (%d genes)\n", i, length(id_batches), length(batch)))
    52. result <- safe_batch_query(batch)
    53. if (!is.null(result) && nrow(result) > 0) {
    54. all_seqs[[length(all_seqs) + 1]] <- result
    55. }
    56. }
    57. if (length(failed_ids) > 0) {
    58. writeLines(failed_ids, paste0("failed_ids_", seqType, ".txt"))
    59. message("⚠️ Failed IDs saved to failed_ids_", seqType, ".txt")
    60. }
    61. if (length(all_seqs) == 0) return(data.frame())
    62. return(do.call(rbind, all_seqs))
    63. }
    64. # === STEP 5: Write FASTA ===
    65. write_fasta <- function(sequences, names, file) {
    66. con <- file(file, "w")
    67. for (i in seq_along(sequences)) {
    68. cat(">", names[i], "\n", sequences[i], "\n", file = con, sep = "")
    69. }
    70. close(con)
    71. }
    72. # === STEP 6: Background sampling ===
    73. cat("📦 Fetching background gene list...\n")
    74. all_deg_ids <- unique(c(up_ids, down_ids))
    75. all_genes <- getBM(attributes = c("ensembl_gene_id"), mart = ensembl)
    76. bg_ids <- setdiff(all_genes$ensembl_gene_id, all_deg_ids)
    77. bg_sample <- sample(bg_ids, 1000)
    78. # === STEP 7: Fetch sequences by group and type ===
    79. seq_types <- c("3utr", "5utr", "cds", "transcript")
    80. groups <- list(
    81. up = up_ids,
    82. down = down_ids,
    83. background = bg_sample
    84. )
    85. for (seq_type in seq_types) {
    86. cat(sprintf("\n⏳ Fetching %s sequences...\n", seq_type))
    87. for (group_name in names(groups)) {
    88. cat(sprintf(" 🔸 Group: %s\n", group_name))
    89. ids <- groups[[group_name]]
    90. seqs <- get_sequences(ids, seq_type)
    91. if (nrow(seqs) > 0) {
    92. out_file <- paste0(seq_type, "_", group_name, ".fasta")
    93. write_fasta(seqs[[seq_type]], seqs$ensembl_gene_id, out_file)
    94. cat(" ✅ Written to", out_file, "\n")
    95. } else {
    96. message("⚠️ No sequences found for ", group_name, " (", seq_type, ")")
    97. }
    98. }
    99. }
    100. cat("\n✅ All FASTA files created for: 3′UTR, 5′UTR, CDS, and Transcript.\n")
  2. Using https://rbpmap.technion.ac.il/1747409685/results.html, how to predict p-value for a specific RBP?

    1. #Try using RBPmap
    2. ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/motif_analysis/run2/3utr_down.fasta
  3. Significant RBP enrichments

    1. install.packages("BiocManager")
    2. BiocManager::install("BSgenome")
    3. BiocManager::install("Biostrings")
    4. BiocManager::install("motifmatchr")
    5. #BiocManager::install("MotifDb")
    6. #BiocManager::install("motifStack")
    7. BiocManager::install("TFBSTools")
    8. #install.packages("pheatmap")
    9. library(Matrix)
    10. library(pheatmap)
    11. library(Biostrings)
    12. library(motifmatchr)
    13. library(GenomicRanges)
    14. #library(MotifDb)
    15. #library(motifStack)
    16. library(TFBSTools)
    17. library(SummarizedExperiment)
    18. library(ggplot2)
    19. library(pheatmap)
    20. library(PWMEnrich)
    21. library(PWMEnrich.Hsapiens.background)
    22. # ------------- Methods -----------
    23. clean_fasta_file <- function(infile, outfile) {
    24. lines <- readLines(infile)
    25. # Initialize variables to store cleaned lines
    26. cleaned_lines <- list()
    27. add_line <- FALSE # To track when we need to add a sequence
    28. for (line in lines) {
    29. # If the line is a header and contains "Sequence unavailable", skip it
    30. if (startsWith(line, ">") && grepl("Sequence unavailable", line)) {
    31. add_line <- FALSE # Don't add this header or its sequence
    32. } else if (startsWith(line, ">")) {
    33. # If it's a new header, start adding the following sequence
    34. add_line <- TRUE
    35. cleaned_lines <- c(cleaned_lines, line)
    36. } else if (add_line) {
    37. # Add the sequence line if it follows a valid header
    38. cleaned_lines <- c(cleaned_lines, line)
    39. }
    40. }
    41. # Write cleaned lines to the output file
    42. writeLines(cleaned_lines, outfile)
    43. }
    44. # Function to clean sequences by replacing invalid characters with 'N'
    45. clean_invalid_sequences <- function(dna_string_set) {
    46. valid_bases <- c("A", "T", "C", "G", "N") # Define valid DNA bases
    47. clean_sequences <- lapply(dna_string_set, function(seq) {
    48. # Remove invalid characters, replace them with 'N'
    49. seq <- gsub("[^ATCGN]", "N", as.character(seq)) # Replace non-ATCGN characters with 'N'
    50. DNAString(seq) # Return as a valid DNAString object
    51. })
    52. return(DNAStringSet(clean_sequences)) # Return cleaned sequences as a DNAStringSet
    53. }
    54. # ============================================
    55. # 2. Read and clean 3′ UTR sequences
    56. # ============================================
    57. # Clean your FASTA file
    58. clean_fasta_file("run2/3utr_up.fasta", "run2/3utr_up_cleaned.fasta")
    59. clean_fasta_file("run2/3utr_down.fasta", "run2/3utr_down_cleaned.fasta")
    60. clean_fasta_file("run2/3utr_background.fasta", "run2/3utr_background_cleaned.fasta")
    61. clean_fasta_file("run2/5utr_up.fasta", "run2/5utr_up_cleaned.fasta")
    62. clean_fasta_file("run2/5utr_down.fasta", "run2/5utr_down_cleaned.fasta")
    63. clean_fasta_file("run2/5utr_background.fasta", "run2/5utr_background_cleaned.fasta")
    64. # Read in the cleaned FASTA files
    65. utr_3_up_raw <- readDNAStringSet("run2/3utr_up_cleaned.fasta")
    66. utr_3_down_raw <- readDNAStringSet("run2/3utr_down_cleaned.fasta")
    67. utr_3_background_raw <- readDNAStringSet("run2/3utr_background_cleaned.fasta")
    68. utr_5_up_raw <- readDNAStringSet("run2/5utr_up_cleaned.fasta")
    69. utr_5_down_raw <- readDNAStringSet("run2/5utr_down_cleaned.fasta")
    70. utr_5_background_raw <- readDNAStringSet("run2/5utr_background_cleaned.fasta")
    71. # Clean sequences
    72. utr_3_up <- clean_invalid_sequences(utr_3_up_raw)
    73. utr_3_down <- clean_invalid_sequences(utr_3_down_raw)
    74. utr_3_background <- clean_invalid_sequences(utr_3_background_raw)
    75. utr_5_up <- clean_invalid_sequences(utr_5_up_raw)
    76. utr_5_down <- clean_invalid_sequences(utr_5_down_raw)
    77. utr_5_background <- clean_invalid_sequences(utr_5_background_raw)
    78. # Filter out sequences shorter than the shortest PWM (e.g. 6 bp)
    79. # and any sequences containing ambiguous bases (non‑ACGT)
    80. min_pwm_length <- 6
    81. utr_3_down_clean <- utr_3_down[
    82. width(utr_3_down) >= min_pwm_length &
    83. !grepl("[^ACGT]", as.character(utr_3_down))
    84. ]
    85. #cat("Kept", length(utr_down_clean), "sequences after cleaning.\n")
    86. utr_3_background_clean <- utr_3_background[
    87. width(utr_3_background) >= min_pwm_length &
    88. !grepl("[^ACGT]", as.character(utr_3_background))
    89. ]
    90. # Install packages (if not already)
    91. if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
    92. BiocManager::install(c("PWMEnrich", "Biostrings", "PWMEnrich.Hsapiens.background", "MotifDb"))
    93. BiocManager::install("PWMEnrich.Hsapiens.background")
    94. # Load libraries
    95. library(PWMEnrich)
    96. library(Biostrings)
    97. library(PWMEnrich.Hsapiens.background)
    98. library(MotifDb)
    99. # ============================================
    100. # 3. Load the hg19 GC‑aware background model
    101. # ============================================
    102. data("PWMLogn.hg19.MotifDb.Hsap",
    103. package = "PWMEnrich.Hsapiens.background")
    104. bg_model <- PWMLogn.hg19.MotifDb.Hsap
    105. # ============================================
    106. # 4. Define your list of RBP gene symbols
    107. # ============================================
    108. # Specific RBP gene symbols
    109. rbp_genes <- c("A1CF", "A2BP1", "AC004381.6", "AC008073.5", "ACO1", "AKAP1", "AKAP17A", "AL662825.2", "AL662825.3", "AL662849.4", "AL844853.7", "ALKBH8", "ANKHD1", "ANKRD17", "APLF", "ASCC1", "BICC1", "BOLL", "BRAP", "BRUNOL4", "BRUNOL5", "BRUNOL6", "BX000357.2", "BX005143.2", "BX119957.7", "BX248507.1", "BX248518.2", "BX511012.1", "BX908728.4", "BX927220.1", "C14orf156", "C14orf21", "CARHSP1", "CELF1", "CELF2", "CELF3", "CIRBP", "CNBP", "CNOT4", "CPEB1", "CPEB2", "CPEB3", "CPEB4", "CPSF4", "CPSF4L", "CPSF6", "CPSF7", "CR388219.1", "CR388372.1", "CR753328.1", "CR759778.5", "CR759782.5", "CR847863.1", "CSDA", "CSDC2", "CSDE1", "CSTF2", "CSTF2T", "DAZ1", "DAZ2", "DAZ3", "DAZ4", "DAZAP1", "DAZL", "DDX41", "DDX43", "DDX53", "DHX57", "DHX8", "DNAJC17", "DPPA5", "DUS3L", "EIF2S1", "EIF3B", "EIF3G", "EIF4B", "EIF4H", "ELAVL1", "ELAVL2", "ELAVL3", "ELAVL4", "ENOX1", "ENOX2", "ENSG00000180771", "ENSG00000183403", "ENSG00000185728", "ENSG00000213250", "ENSG00000215042", "ENSG00000215492", "ENSG00000231942", "ENSG00000248163", "ENSG00000249536", "ENSG00000249644", "ENSG00000250177", "ERAL1", "ESRP1", "ESRP2", "EWSR1", "FMR1", "FUBP1", "FUBP3", "FUS", "Fusip1", "FXR1", "FXR2", "G3BP1", "G3BP2", "GRSF1", "HDLBP", "HELZ", "HNRNPA0", "HNRNPA1", "HNRNPA1L2", "HNRNPA2B1", "HNRNPA3", "HNRNPAB", "HNRNPC", "HNRNPCL1", "HNRNPD", "HNRNPF", "HNRNPH1", "HNRNPH2", "HNRNPH3", "hnRNPK", "HNRNPL", "hnRNPLL", "HNRNPM", "HNRNPR", "HNRPDL", "HTATSF1", "IGF2BP1", "IGF2BP2", "IGF2BP3", "KHDRBS1", "KHDRBS2", "KHDRBS3", "KHSRP", "KIAA0430", "KRR1", "LARP1", "LARP1B", "LARP4", "LARP4B", "LARP6", "LARP7", "LENG9", "LIN28A", "LIN28B", "MATR3", "MBNL1", "MBNL2", "MBNL3", "MDM2", "MEX3B", "MEX3C", "MEX3D", "MKI67IP", "MKRN1", "MKRN2", "MKRN3", "MRPS28", "MSI1", "MSI2", "MTHFSD", "MYEF2", "NCBP2", "NCBP2L", "NCL", "NEIL3", "NOL8", "NONO", "NOVA1", "NOVA2", "NPLOC4", "NUP153", "NUPL2", "PABPC1", "PABPC1L", "PABPC1L2A", "PABPC1L2B", "PABPC3", "PABPC4", "PABPC5", "PABPN1", "PABPN1L", "PAN3", "PARP12", "PCBP1", "PCBP2", "PCBP3", "PCBP4", "PDCD11", "PEG10", "PNMA3", "PNO1", "PNPT1", "POLDIP3", "POLR2G", "PPARGC1A", "PPARGC1B", "PPIE", "PPIL4", "PPP1R10", "PPRC1", "PRR3", "PSPC1", "PTBP1", "PTBP2", "PUF60", "PUM1", "PUM2", "QKI", "RALY", "RANBP2", "RAVER1", "RAVER2", "RBBP6", "RBCK1", "RBFOX2", "RBFOX3", "RBM10", "RBM11", "RBM12", "RBM12B", "RBM14-RBM4", "RBM15", "RBM15B", "RBM17", "RBM18", "RBM19", "RBM22", "RBM23", "RBM24", "RBM25", "RBM26", "RBM27", "RBM28", "RBM3", "RBM33", "RBM34", "RBM38", "RBM39", "RBM4", "RBM41", "RBM42", "RBM44", "RBM45", "RBM46", "RBM47", "RBM4B", "RBM5", "RBM6", "RBM7", "RBM8A", "RBMS1", "RBMS2", "RBMS3", "RBMX", "RBMX2", "RBMXL1", "RBMXL2", "RBMXL3", "RBMY1A1", "RBMY1B", "RBMY1D", "RBMY1E", "RBMY1F", "RBMY1J", "RBPMS", "RBPMS2", "RBP_Name", "RC3H1", "RC3H2", "RCAN2", "RDBP", "RDM1", "RNF113A", "RNF113B", "RNF31", "RNPC3", "RNPS1", "ROD1", "RP11-1286E23.4", "RRP7A", "RYBP", "SAFB", "SAFB2", "SAMD4A", "SAMD4B", "SART3", "SCAF4", "SCAF8", "SETD1A", "SETD1B", "SF1", "SF3B4", "SFPQ", "SHARPIN", "SLTM", "SNRNP35", "SNRNP70", "SNRPA", "SNRPB2", "SOLH", "SPEN", "SRBD1", "SREK1", "SRRT", "SRSF1", "SRSF11", "SRSF12", "SRSF2", "SRSF3", "SRSF4", "SRSF5", "SRSF6", "SRSF7", "SRSF9", "SSB", "STAR-PAP", "SUPT6H", "SYNCRIP", "TAB2", "TAB3", "TAF15", "TARDBP", "TDRD10", "TDRKH", "TEX13A", "THOC4", "TIA1", "TIAL1", "TMEM63A", "TMEM63B", "TNRC6A", "TNRC6B", "TNRC6C", "TOE1", "TRA2A", "TRA2B", "TRMT1", "TRMT2A", "TRNAU1AP", "TTC14", "U2AF1", "U2AF1L4", "U2AF2", "U2SURP", "UHMK1", "UNK", "UNKL", "UPF3B", "YAF2", "YB-1", "YBX2", "YTHDC1", "YTHDC2", "YTHDF1", "YTHDF2", "ZC3H10", "ZC3H13", "ZC3H15", "ZC3H18", "ZC3H3", "ZC3H4", "ZC3H6", "ZC3H7A", "ZC3H7B", "ZC3H8", "ZCCHC11", "ZCCHC13", "ZCCHC14", "ZCCHC17", "ZCCHC2", "ZCCHC3", "ZCCHC5", "ZCCHC6", "ZCCHC7", "ZCCHC8", "ZCCHC9", "ZCRB1", "ZFP36", "ZFP36L1", "ZFP36L2", "ZGPAT", "ZMAT5", "ZNF638", "ZRANB1", "ZRANB2", "ZRANB3", "ZRSR1", "ZRSR2")
    110. # ============================================
    111. # 5. Extract all PWMs from background & filter to RBPs
    112. # ============================================
    113. #all_pwms <- bg_model@pwms #2287
    114. #rbp_pwms <- all_pwms[
    115. # vapply(names(all_pwms),
    116. # function(nm) any(nm %in% rbp_genes),
    117. # logical(1))
    118. #]
    119. # 如果你想做模糊匹配(基因名作为子串出现),可以用:
    120. rbp_pwms <- all_pwms[grepl(paste(rbp_genes, collapse="|"), names(all_pwms), ignore.case=TRUE)]
    121. cat("Number of RBP PWMs selected:", length(rbp_pwms), "\n")
    122. cat("Example PWM names:\n", head(names(rbp_pwms), 10), "\n")
    123. # ============================================
    124. # 6. Create a custom background model with only RBP PWMs
    125. # ============================================
    126. custom_bg <- bg_model
    127. custom_bg@pwms <- rbp_pwms
    128. # ============================================
    129. # 7. Run motif enrichment
    130. # ============================================
    131. enrichment_result <- motifEnrichment(utr_3_down_clean, custom_bg)
    132. # ============================================
    133. # 8. Generate report & convert to data.frame
    134. # ============================================
    135. report_list <- groupReport(enrichment_result)
    136. report_df <- as.data.frame(report_list)
    137. # ============================================
    138. # 9. Apply BH correction and filter significant hits
    139. # ============================================
    140. report_df$adj_pval <- p.adjust(report_df$p.value, method = "BH")
    141. significant_hits <- subset(report_df, adj_pval < 0.05)
    142. # ============================================
    143. # 10. View results
    144. # ============================================
    145. # Display motif name, raw p-value, adjusted p-value, and enrichment score
    146. colnames(significant_hits)[
    147. match(c("target","id","raw.score"), colnames(significant_hits))
    148. ] <- c("Gene", "Motif_ID", "EnrichmentScore")
    149. print(
    150. head(
    151. significant_hits[, c("Gene","Motif_ID","EnrichmentScore","p.value","adj_pval")],
    152. 6
    153. )
    154. )
    155. p <- ggplot(significant_hits,
    156. aes(x = reorder(Motif_ID, adj_pval),
    157. y = -log10(adj_pval))) +
    158. geom_col() +
    159. coord_flip() +
    160. labs(
    161. x = "Motif ID",
    162. y = expression(-log[10]("FDR‑adjusted p‑value")),
    163. title = "Significant RBP Motif Enrichment"
    164. ) +
    165. theme_minimal()
    166. ggsave(
    167. filename = "RBP_motif_enrichment.png", # or any path you like
    168. plot = p,
    169. width = 8, # inches
    170. height = 6, # inches
    171. dpi = 300 # resolution
    172. )
  4. Perform similar RBP-mapping by myself

    1. # Load a set of RBP motifs from a database (e.g., RBPDB) RBPDB, ENCODE, JASPAR, or even UCL's RBP motifs.
    2. # Set path to your PWM directory
    3. pfm_dir <- "/media/jhuang/Elements/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/motif_analysis/PFMDir"
    4. # List all .pfm files
    5. pfm_files <- list.files(pfm_dir, pattern = "\\.pfm$", full.names = TRUE)
    6. # Create an empty list to store PWMs
    7. pfm_list <- list()
    8. # Loop through each file
    9. for (file in pfm_files) {
    10. # Read table
    11. pfm <- tryCatch({
    12. mat <- read.table(file, header = FALSE)
    13. mat <- as.matrix(mat)
    14. # Optionally add column names (assuming 4 columns for A, C, G, T)
    15. if (ncol(mat) == 4) {
    16. colnames(mat) <- c("A", "C", "G", "T")
    17. }
    18. mat
    19. }, error = function(e) {
    20. message("Failed to read: ", file)
    21. NULL
    22. })
    23. # Use the filename (without extension) as list name
    24. name <- tools::file_path_sans_ext(basename(file))
    25. pfm_list[[name]] <- pfm
    26. }
    27. # Check one example
    28. str(pfm_list[[1]])
    29. #saveRDS(pfm_list, file = "compiled_pfm_list.rds")
    30. #The error you're seeing (invalid PFM 'x': not an integer matrix) indicates that the PWM function expects the input matrix to represent raw position frequency matrices (PFMs), which are expected to be integer counts, but the matrices you have are likely log-odds matrices (floating point values), which represent the log-transformed scores instead of raw counts.
    31. # convert to PFMatrix
    32. pfm_named_list <- lapply(pfm_list, function(mat) {
    33. mat <- as.matrix(mat)
    34. if (!all(rownames(mat) %in% c("A", "C", "G", "T"))) {
    35. rownames(mat) <- c("A", "C", "G", "T")
    36. }
    37. PFMatrix(ID = "motif", name = "motif", profileMatrix = mat)
    38. })
    39. # Create an empty list to store results
    40. match_results_list <- list()
    41. # Iterate through each PFMatrix object in the list
    42. for (i in seq_along(pfm_named_list)) {
    43. # Get the current PFMatrix object
    44. current_pfm <- pfm_named_list[[i]]
    45. # Run matchMotifs for the current motif
    46. match_result <- tryCatch({
    47. #matchMotifs(pfm_obj, subject = utr_3_down, genome = NULL, p.cutoff = 1e-3)
    48. matchMotifs(current_pfm, subject = utr_3_down, genome = NULL, p.cutoff = 1e-3)
    49. }, error = function(e) {
    50. message("Error with motif ", names(pfm_named_list)[i], ": ", e$message)
    51. return(NULL) # Return NULL if error occurs
    52. })
    53. # Store the result if no error
    54. if (!is.null(match_result)) {
    55. match_results_list[[names(pfm_named_list)[i]]] <- match_result
    56. }
    57. }
    58. # Check the results
    59. match_results_list
    60. # Extract motif match results for one motif
    61. motif_matches_1004_8676391 <- assay(match_results_list[["1004_8676391"]], "motifMatches")
    62. # Convert sparse matrix to dense matrix
    63. dense_matrix <- as.matrix(motif_matches_1004_8676391)
    64. # Optional: Add rownames for better visualization (if available)
    65. if (!is.null(names(utr_3_down))) {
    66. rownames(dense_matrix) <- names(utr_3_down)
    67. }
    68. # Convert logical matrix to numeric (1 for TRUE, 0 for FALSE)
    69. dense_matrix_num <- matrix(as.numeric(dense_matrix),
    70. nrow = nrow(dense_matrix),
    71. dimnames = dimnames(dense_matrix))
    72. # Now plot the heatmap
    73. png("motif_match_1004_8676391.png", width = 800, height = 1600)
    74. pheatmap(dense_matrix_num,
    75. cluster_rows = FALSE,
    76. cluster_cols = FALSE,
    77. color = c("grey", "steelblue"),
    78. legend_breaks = c(0, 1),
    79. legend_labels = c("No match", "Match"),
    80. main = "Motif Match: 1004_8676391")
    81. dev.off()
    82. #✅ Output
    83. #You will see a binary heatmap, where:
    84. # * Blue cells = motif matched
    85. # * White cells = no match
    86. #Let me know if you want to loop this over all motifs or save the plot to a file.
  5. Does hg19 vs hg38 Position Matter in Motif Enrichment?

    1. If you're doing motif enrichment analysis only (like PWMEnrich):
    2. Position DOES NOT matter.
    3. Here's why:
    4. Aspect Explanation
    5. 🔍 PWMEnrich Only scans sequences for motif matches based on nucleotide content (e.g., enrichment of AU-rich motifs in downregulated 3UTRs).
    6. 📏 Coordinates Genomic coordinates (hg19 vs hg38) are not used in enrichment. Only the sequence itself matters.
    7. ⚖️ Statistical Test The enrichment is based on observed motif frequency vs background expectations (GC-aware or custom). No genome alignment is done.
    8. So:
    9. As long as your 3 UTR sequences are correct and biologically relevant, using an hg19 background is safe even if your gene models are from hg38.
    10. ❗️ When does genome version matter?
    11. If you're doing things like:
    12. Use case Position-sensitive?
    13. 🔬 Variant effect on motifs (e.g., SNPs in motifs) ✅ Yes
    14. 🧭 Mapping motifs to genome positions (e.g., ChIP-seq overlaps) ✅ Yes
    15. 📦 Loading sequences from coordinates (not FASTA) ✅ Yes
    16. 🧠 Expression + genomic track integration (e.g., motif + ATAC-seq) ✅ Yes
    17. Then you must ensure genome versions match exactly.
    18. ✅ In Your Case: Motif Enrichment of Downregulated 3′UTRs
    19. If you are:
    20. Feeding real 3′UTR sequences (as DNAStringSet) for downregulated genes
    21. Using PWMEnrich purely for motif enrichment
    22. Then:
    23. ✅ hg19 vs hg38 does not matter
    24. ❗ Only sequence content matters
    25. Let me know if you want to later map enriched motifs back to genome positions — that’s when we’d switch to position-aware tools like FIMO, MOODS, or rtracklayer.
  6. Custom RBP-database preparation (failed!)

    1. # Example: Create a PWM for 2 fake motifs (replace with your actual PWMs)
    2. #motif1 <- matrix(c(0.2, 0.3, 0.3, 0.2,
    3. # 0.1, 0.4, 0.4, 0.1,
    4. # 0.25, 0.25, 0.25, 0.25,
    5. # 0.3, 0.2, 0.3, 0.2), nrow = 4, byrow = TRUE,
    6. # dimnames = list(c("A", "C", "G", "T"), NULL))
    7. #motif2 <- matrix(c(0.3, 0.2, 0.2, 0.3,
    8. # 0.4, 0.1, 0.4, 0.1,
    9. # 0.25, 0.25, 0.25, 0.25,
    10. # 0.2, 0.3, 0.2, 0.3), nrow = 4, byrow = TRUE,
    11. # dimnames = list(c("A", "C", "G", "T"), NULL))
    12. #rbp_pwms <- list(RBP1 = motif1, RBP2 = motif2)
    13. pwm_dir <- "/media/jhuang/Elements/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/motif_analysis/PWMDir"
    14. pwm_files <- list.files(pwm_dir, pattern = "\\.pwm$", full.names = TRUE)
    15. rbp_pwms <- list()
    16. for (file in pwm_files) {
    17. # Read table
    18. pwm <- tryCatch({
    19. mat <- read.table(file, header = FALSE)
    20. mat <- as.matrix(mat)
    21. # Optionally add column names (assuming 4 columns for A, C, G, T)
    22. if (ncol(mat) == 4) {
    23. colnames(mat) <- c("A", "C", "G", "T")
    24. }
    25. mat
    26. }, error = function(e) {
    27. message("Failed to read: ", file)
    28. NULL
    29. })
    30. # Use the filename (without extension) as list name
    31. name <- tools::file_path_sans_ext(basename(file))
    32. rbp_pwms[[name]] <- pwm
    33. }
    34. # -- filtering (ERROR) --
    35. pwm_list <- motifs(bg_model)
    36. # Example: filter motifs containing 'ELAVL' or 'HNRNP'
    37. #rbp_pwms <- pwm_list[grep("ELAVL|HNRNP|IGF2BP|RBFOX|FMR1|PUM", names(pwm_list))]
    38. rbp_enrichment <- motifEnrichment(utr_3_down, bg_model, pwmList = rbp_pwms)
    39. rbp_report <- groupReport(rbp_enrichment)
    40. # -- from MotifDb --
    41. library(MotifDb)
    42. human_motifs <- query(MotifDb, "Hsapiens")
    43. rbp_motifs <- query(human_motifs, "RBP")
    44. # ---- try Build your background model directly from these PWMs ----
    45. # 1. List all PWM files
    46. files_all <- list.files("~/Downloads/cisbp-rna/pwms_all_motifs", pattern="\\.txt$", full.names=TRUE)
    47. # 2. Filter out empty files
    48. file_sizes <- file.info(files_all)$size
    49. files <- files_all[file_sizes > 0]
    50. cat("Reading", length(files), "non‑empty PWM files out of", length(files_all), "\n")
    51. # 3. Define the reader (as before)
    52. readCisbpPfm <- function(path) {
    53. df <- read.table(path, header=TRUE, stringsAsFactors=FALSE)
    54. mat_rna <- as.matrix(df[, c("A","C","G","U")])
    55. mat_dna <- t(mat_rna)
    56. rownames(mat_dna) <- c("A","C","G","T")
    57. sweep(mat_dna, 2, colSums(mat_dna), FUN="/")
    58. }
    59. # 4. Read only the non‐empty files
    60. cisbp_mats <- lapply(files, readCisbpPfm)
    61. names(cisbp_mats) <- tools::file_path_sans_ext(basename(files))
    62. # 5. (Optional) Check the first few
    63. head(names(cisbp_mats))
    64. length(cisbp_mats) #193
    65. # 6. Build your background model directly from these PWMs:
    66. bg_utrs <- utr_3_background_clean #readDNAStringSet("run2/3utr_background.fasta")
    67. #bg_utrs <- bg_utrs[ width(bg_utrs)>=6 & !grepl("[^ACGT]", as.character(bg_utrs)) ]
    68. custom_bg <- makeBackground(
    69. seqs = bg_utrs,
    70. motifs = cisbp_mats,
    71. type = "logn",
    72. organism = "hg19",
    73. verbose = TRUE
    74. )
    75. custom_bg <- makeBackground(
    76. sequences = bg_utrs, # your DNAStringSet of background UTRs
    77. motifs = cisbp_mats, # your named list of PWM matrices
    78. type = "logn", # log‑normal GC‑aware model
    79. organism = NULL, # disable built‑in genomes
    80. verbose = TRUE
    81. )
    82. custom_bg <- makeBackground(
    83. cisbp_mats, # 1) motifs
    84. organism = "hg19",
    85. type = "logn", # 2) background model type
    86. bg.seq = bg_utrs, # 3) override organism by supplying your own sequences
    87. quick = FALSE # (optional) fit to all sequences for best accuracy
    88. )
    89. # 1. Pick a library size to re‑scale your PWMs into PFMs
    90. library_size <- 1000 # you can also try 1000
    91. # 2. Convert each normalized matrix into integer counts
    92. cisbp_pfms <- lapply(cisbp_mats, function(pmat) {
    93. counts <- round(pmat * library_size)
    94. # ensure no column sums to zero (add pseudocount 1 if needed)
    95. zero_cols <- which(colSums(counts) == 0)
    96. if (length(zero_cols) > 0) {
    97. counts[, zero_cols] <- 1
    98. }
    99. counts
    100. })
    101. # 3. Build a custom background (PFMs + your bg UTRs)
    102. custom_bg <- makeBackground(
    103. cisbp_pfms, # 1) a list of integer PFMs
    104. type = "logn", # 2) log‑normal GC‑aware model
    105. bg.seq = bg_utrs,# 3) your background DNAStringSet
    106. quick = FALSE # fit on all sequences
    107. )
    108. # … then continue with motifEnrichment() on your down‑regulated UTRs as before.
    109. # 5) Read & clean your down‑regulated 3′UTRs
    110. down_utrs <- readDNAStringSet("downregulated_utrs.fasta")
    111. down_utrs <- down_utrs[ width(down_utrs)>=6 & !grepl("[^ACGT]", as.character(down_utrs)) ]
    112. # 6) Run the enrichment
    113. res <- motifEnrichment(down_utrs, custom_bg)
    114. report <- groupReport(res)
    115. df <- as.data.frame(report)
    116. df$adj_pval <- p.adjust(df$p.value, method="BH")
    117. # 7) Subset significant
    118. sig <- subset(df, adj_pval < 0.05)
    119. print(sig[, c("motif.name","p.value","adj_pval","enrichmentScore")])

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum