Analysis of the RNA binding protein motifs for RNA Seq

gene_x 0 like s 571 view s

Tags: pipeline

TODOs_NEXTWEEK: how to use RBPmap results to enrich the significant RBPs for RNA-seq (3' UTRs?) and miRNAs?

    🧬 1. RBPmap (Web + Command-line)

        Type: Web tool and downloadable Perl scripts

        Purpose: Scans RNA sequences for known RBP motifs

        Database: Uses curated RBP motif sets (e.g., from literature, CLIP data)

        URL: https://rbpmap.technion.ac.il/

    ✅ Good for: Biologists without strong R background
    🚫 Not a package, but scriptable via CLI or HTML results

filtering out RBP from the database

improve the output of the 3utr_down_cleaned.fasta

multiple testing

perform this for miRNA.

http://xgenes.com/article/article-content/146/peak-and-motif-analyses-in-promoters/

http://xgenes.com/article/article-content/154/density-of-motif-plots-and-its-statistical-tests/

There are several alternative R packages and tools to perform motif enrichment analysis for RNA-binding proteins (RBPs), beyond PWMEnrich::motifEnrichment(). Here are the most notable ones:

| ------------------------ | ----------------- | --------------- | --------- | -------------- |

| PWMEnrich | ✅ | ✅ | R | ✅ |

| RBPmap | ✅ | ❌ (uses own db) | Web/CLI | ✅ | ----> try RBPmap_results + enrichments!

| Biostrings/TFBSTools | ❌ (only scanning) | ✅ | R | ❌ | #ATtRACT + Biostrings / TFBSTools

| rmap | ✅ (CLIP-based) | ❌ | R | ✅ |

| Homer | ✅ | ✅ | CLI | ⚠ RNA optional |

Using R to get 3UTR_sequences.fasta, 5UTR

setwd("~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/motif_analysis")

# === STEP 0: Load libraries ===
if (!requireNamespace("biomaRt", quietly = TRUE)) install.packages("biomaRt")
library(biomaRt)

# === STEP 1: Load DEG files ===
~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-up.txt    #20086
~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-down.txt  #634
~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-up.txt     #23832
~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-down.txt   #375
up_file <- "~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-up.txt"
down_file <- "~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-down.txt"

up_degs <- read.table(up_file, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
down_degs <- read.table(down_file, header = TRUE, sep = "\t", stringsAsFactors = FALSE)

# === STEP 2: Clean & extract Ensembl gene IDs ===
up_ids <- unique(na.omit(up_degs[[1]]))
down_ids <- unique(na.omit(down_degs[[1]]))

# Optional: protein-coding filter
# up_ids <- unique(up_degs$external_gene_name[up_degs$gene_biotype == "protein_coding"])
# down_ids <- unique(down_degs$external_gene_name[down_degs$gene_biotype == "protein_coding"])

# === STEP 3: Connect to Ensembl Asia mirror ===
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl", host = "https://asia.ensembl.org")

# === STEP 4: Robust, minimal-loss sequence fetch ===
get_sequences <- function(ids, seqType, batch_size = 300, retry_depth = 3) {
all_seqs <- list()
failed_ids <- c()

safe_batch_query <- function(batch_ids, depth = 1) {
    if (length(batch_ids) == 0) return(NULL)

    tryCatch({
    seqs <- getSequence(id = batch_ids,
                        type = "ensembl_gene_id",
                        seqType = seqType,
                        mart = ensembl)
    seqs <- seqs[seqs[[seqType]] != "", ]
    return(seqs)
    }, error = function(e) {
    if (depth < retry_depth && length(batch_ids) > 1) {
        # Split batch and retry each half
        split_batches <- split(batch_ids, ceiling(seq_along(batch_ids)/ceiling(length(batch_ids)/2)))
        result <- lapply(split_batches, safe_batch_query, depth = depth + 1)
        return(do.call(rbind, result))
    } else {
        message("❌ Final failure for ", length(batch_ids), " genes. Logging IDs.")
        failed_ids <<- c(failed_ids, batch_ids)
        return(NULL)
    }
    })
}

id_batches <- split(ids, ceiling(seq_along(ids) / batch_size))
for (i in seq_along(id_batches)) {
    batch <- id_batches[[i]]
    cat(sprintf("    🔹 Batch %d/%d (%d genes)\n", i, length(id_batches), length(batch)))
    result <- safe_batch_query(batch)
    if (!is.null(result) && nrow(result) > 0) {
    all_seqs[[length(all_seqs) + 1]] <- result
    }
}

if (length(failed_ids) > 0) {
    writeLines(failed_ids, paste0("failed_ids_", seqType, ".txt"))
    message("⚠️ Failed IDs saved to failed_ids_", seqType, ".txt")
}

if (length(all_seqs) == 0) return(data.frame())
return(do.call(rbind, all_seqs))
}

# === STEP 5: Write FASTA ===
write_fasta <- function(sequences, names, file) {
con <- file(file, "w")
for (i in seq_along(sequences)) {
    cat(">", names[i], "\n", sequences[i], "\n", file = con, sep = "")
}
close(con)
}

# === STEP 6: Background sampling ===
cat("📦 Fetching background gene list...\n")
all_deg_ids <- unique(c(up_ids, down_ids))
all_genes <- getBM(attributes = c("ensembl_gene_id"), mart = ensembl)
bg_ids <- setdiff(all_genes$ensembl_gene_id, all_deg_ids)
bg_sample <- sample(bg_ids, 1000)

# === STEP 7: Fetch sequences by group and type ===
seq_types <- c("3utr", "5utr", "cds", "transcript")
groups <- list(
up = up_ids,
down = down_ids,
background = bg_sample
)

for (seq_type in seq_types) {
cat(sprintf("\n⏳ Fetching %s sequences...\n", seq_type))
for (group_name in names(groups)) {
    cat(sprintf("  🔸 Group: %s\n", group_name))
    ids <- groups[[group_name]]
    seqs <- get_sequences(ids, seq_type)
    if (nrow(seqs) > 0) {
    out_file <- paste0(seq_type, "_", group_name, ".fasta")
    write_fasta(seqs[[seq_type]], seqs$ensembl_gene_id, out_file)
    cat("    ✅ Written to", out_file, "\n")
    } else {
    message("⚠️ No sequences found for ", group_name, " (", seq_type, ")")
    }
}
}

cat("\n✅ All FASTA files created for: 3′UTR, 5′UTR, CDS, and Transcript.\n")

Using https://rbpmap.technion.ac.il/1747409685/results.html, how to predict p-value for a specific RBP?
```
#Try using RBPmap
~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/motif_analysis/run2/3utr_down.fasta
```

Significant RBP enrichments

install.packages("BiocManager")
BiocManager::install("BSgenome")
BiocManager::install("Biostrings")
BiocManager::install("motifmatchr")
#BiocManager::install("MotifDb")
#BiocManager::install("motifStack")
BiocManager::install("TFBSTools")
#install.packages("pheatmap")

library(Matrix)
library(pheatmap)
library(Biostrings)
library(motifmatchr)
library(GenomicRanges)
#library(MotifDb)
#library(motifStack)
library(TFBSTools)
library(SummarizedExperiment)
library(ggplot2)
library(pheatmap)

library(PWMEnrich)
library(PWMEnrich.Hsapiens.background)

# ------------- Methods -----------

clean_fasta_file <- function(infile, outfile) {
lines <- readLines(infile)

# Initialize variables to store cleaned lines
cleaned_lines <- list()
add_line <- FALSE  # To track when we need to add a sequence

for (line in lines) {
    # If the line is a header and contains "Sequence unavailable", skip it
    if (startsWith(line, ">") && grepl("Sequence unavailable", line)) {
    add_line <- FALSE  # Don't add this header or its sequence
    } else if (startsWith(line, ">")) {
    # If it's a new header, start adding the following sequence
    add_line <- TRUE
    cleaned_lines <- c(cleaned_lines, line)
    } else if (add_line) {
    # Add the sequence line if it follows a valid header
    cleaned_lines <- c(cleaned_lines, line)
    }
}

# Write cleaned lines to the output file
writeLines(cleaned_lines, outfile)
}

# Function to clean sequences by replacing invalid characters with 'N'
clean_invalid_sequences <- function(dna_string_set) {
valid_bases <- c("A", "T", "C", "G", "N")  # Define valid DNA bases
clean_sequences <- lapply(dna_string_set, function(seq) {
    # Remove invalid characters, replace them with 'N'
    seq <- gsub("[^ATCGN]", "N", as.character(seq))  # Replace non-ATCGN characters with 'N'
    DNAString(seq)  # Return as a valid DNAString object
})
return(DNAStringSet(clean_sequences))  # Return cleaned sequences as a DNAStringSet
}

# ============================================
# 2. Read and clean 3′ UTR sequences
# ============================================
# Clean your FASTA file
clean_fasta_file("run2/3utr_up.fasta", "run2/3utr_up_cleaned.fasta")
clean_fasta_file("run2/3utr_down.fasta", "run2/3utr_down_cleaned.fasta")
clean_fasta_file("run2/3utr_background.fasta", "run2/3utr_background_cleaned.fasta")
clean_fasta_file("run2/5utr_up.fasta", "run2/5utr_up_cleaned.fasta")
clean_fasta_file("run2/5utr_down.fasta", "run2/5utr_down_cleaned.fasta")
clean_fasta_file("run2/5utr_background.fasta", "run2/5utr_background_cleaned.fasta")

# Read in the cleaned FASTA files
utr_3_up_raw <- readDNAStringSet("run2/3utr_up_cleaned.fasta")
utr_3_down_raw <- readDNAStringSet("run2/3utr_down_cleaned.fasta")
utr_3_background_raw <- readDNAStringSet("run2/3utr_background_cleaned.fasta")
utr_5_up_raw <- readDNAStringSet("run2/5utr_up_cleaned.fasta")
utr_5_down_raw <- readDNAStringSet("run2/5utr_down_cleaned.fasta")
utr_5_background_raw <- readDNAStringSet("run2/5utr_background_cleaned.fasta")

# Clean sequences
utr_3_up <- clean_invalid_sequences(utr_3_up_raw)
utr_3_down <- clean_invalid_sequences(utr_3_down_raw)
utr_3_background <- clean_invalid_sequences(utr_3_background_raw)
utr_5_up <- clean_invalid_sequences(utr_5_up_raw)
utr_5_down <- clean_invalid_sequences(utr_5_down_raw)
utr_5_background <- clean_invalid_sequences(utr_5_background_raw)

# Filter out sequences shorter than the shortest PWM (e.g. 6 bp)
# and any sequences containing ambiguous bases (non‑ACGT)
min_pwm_length <- 6
utr_3_down_clean <- utr_3_down[
width(utr_3_down) >= min_pwm_length &
!grepl("[^ACGT]", as.character(utr_3_down))
]
#cat("Kept", length(utr_down_clean), "sequences after cleaning.\n")
utr_3_background_clean <- utr_3_background[
width(utr_3_background) >= min_pwm_length &
!grepl("[^ACGT]", as.character(utr_3_background))
]

# Install packages (if not already)
if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
BiocManager::install(c("PWMEnrich", "Biostrings", "PWMEnrich.Hsapiens.background", "MotifDb"))
BiocManager::install("PWMEnrich.Hsapiens.background")

# Load libraries
library(PWMEnrich)
library(Biostrings)
library(PWMEnrich.Hsapiens.background)
library(MotifDb)

# ============================================
# 3. Load the hg19 GC‑aware background model
# ============================================
data("PWMLogn.hg19.MotifDb.Hsap",
    package = "PWMEnrich.Hsapiens.background")
bg_model <- PWMLogn.hg19.MotifDb.Hsap

# ============================================
# 4. Define your list of RBP gene symbols
# ============================================

# Specific RBP gene symbols
rbp_genes <- c("A1CF", "A2BP1", "AC004381.6", "AC008073.5", "ACO1", "AKAP1", "AKAP17A", "AL662825.2", "AL662825.3", "AL662849.4", "AL844853.7", "ALKBH8", "ANKHD1", "ANKRD17", "APLF", "ASCC1", "BICC1", "BOLL", "BRAP", "BRUNOL4", "BRUNOL5", "BRUNOL6", "BX000357.2", "BX005143.2", "BX119957.7", "BX248507.1", "BX248518.2", "BX511012.1", "BX908728.4", "BX927220.1", "C14orf156", "C14orf21", "CARHSP1", "CELF1", "CELF2", "CELF3", "CIRBP", "CNBP", "CNOT4", "CPEB1", "CPEB2", "CPEB3", "CPEB4", "CPSF4", "CPSF4L", "CPSF6", "CPSF7", "CR388219.1", "CR388372.1", "CR753328.1", "CR759778.5", "CR759782.5", "CR847863.1", "CSDA", "CSDC2", "CSDE1", "CSTF2", "CSTF2T", "DAZ1", "DAZ2", "DAZ3", "DAZ4", "DAZAP1", "DAZL", "DDX41", "DDX43", "DDX53", "DHX57", "DHX8", "DNAJC17", "DPPA5", "DUS3L", "EIF2S1", "EIF3B", "EIF3G", "EIF4B", "EIF4H", "ELAVL1", "ELAVL2", "ELAVL3", "ELAVL4", "ENOX1", "ENOX2", "ENSG00000180771", "ENSG00000183403", "ENSG00000185728", "ENSG00000213250", "ENSG00000215042", "ENSG00000215492", "ENSG00000231942", "ENSG00000248163", "ENSG00000249536", "ENSG00000249644", "ENSG00000250177", "ERAL1", "ESRP1", "ESRP2", "EWSR1", "FMR1", "FUBP1", "FUBP3", "FUS", "Fusip1", "FXR1", "FXR2", "G3BP1", "G3BP2", "GRSF1", "HDLBP", "HELZ", "HNRNPA0", "HNRNPA1", "HNRNPA1L2", "HNRNPA2B1", "HNRNPA3", "HNRNPAB", "HNRNPC", "HNRNPCL1", "HNRNPD", "HNRNPF", "HNRNPH1", "HNRNPH2", "HNRNPH3", "hnRNPK", "HNRNPL", "hnRNPLL", "HNRNPM", "HNRNPR", "HNRPDL", "HTATSF1", "IGF2BP1", "IGF2BP2", "IGF2BP3", "KHDRBS1", "KHDRBS2", "KHDRBS3", "KHSRP", "KIAA0430", "KRR1", "LARP1", "LARP1B", "LARP4", "LARP4B", "LARP6", "LARP7", "LENG9", "LIN28A", "LIN28B", "MATR3", "MBNL1", "MBNL2", "MBNL3", "MDM2", "MEX3B", "MEX3C", "MEX3D", "MKI67IP", "MKRN1", "MKRN2", "MKRN3", "MRPS28", "MSI1", "MSI2", "MTHFSD", "MYEF2", "NCBP2", "NCBP2L", "NCL", "NEIL3", "NOL8", "NONO", "NOVA1", "NOVA2", "NPLOC4", "NUP153", "NUPL2", "PABPC1", "PABPC1L", "PABPC1L2A", "PABPC1L2B", "PABPC3", "PABPC4", "PABPC5", "PABPN1", "PABPN1L", "PAN3", "PARP12", "PCBP1", "PCBP2", "PCBP3", "PCBP4", "PDCD11", "PEG10", "PNMA3", "PNO1", "PNPT1", "POLDIP3", "POLR2G", "PPARGC1A", "PPARGC1B", "PPIE", "PPIL4", "PPP1R10", "PPRC1", "PRR3", "PSPC1", "PTBP1", "PTBP2", "PUF60", "PUM1", "PUM2", "QKI", "RALY", "RANBP2", "RAVER1", "RAVER2", "RBBP6", "RBCK1", "RBFOX2", "RBFOX3", "RBM10", "RBM11", "RBM12", "RBM12B", "RBM14-RBM4", "RBM15", "RBM15B", "RBM17", "RBM18", "RBM19", "RBM22", "RBM23", "RBM24", "RBM25", "RBM26", "RBM27", "RBM28", "RBM3", "RBM33", "RBM34", "RBM38", "RBM39", "RBM4", "RBM41", "RBM42", "RBM44", "RBM45", "RBM46", "RBM47", "RBM4B", "RBM5", "RBM6", "RBM7", "RBM8A", "RBMS1", "RBMS2", "RBMS3", "RBMX", "RBMX2", "RBMXL1", "RBMXL2", "RBMXL3", "RBMY1A1", "RBMY1B", "RBMY1D", "RBMY1E", "RBMY1F", "RBMY1J", "RBPMS", "RBPMS2", "RBP_Name", "RC3H1", "RC3H2", "RCAN2", "RDBP", "RDM1", "RNF113A", "RNF113B", "RNF31", "RNPC3", "RNPS1", "ROD1", "RP11-1286E23.4", "RRP7A", "RYBP", "SAFB", "SAFB2", "SAMD4A", "SAMD4B", "SART3", "SCAF4", "SCAF8", "SETD1A", "SETD1B", "SF1", "SF3B4", "SFPQ", "SHARPIN", "SLTM", "SNRNP35", "SNRNP70", "SNRPA", "SNRPB2", "SOLH", "SPEN", "SRBD1", "SREK1", "SRRT", "SRSF1", "SRSF11", "SRSF12", "SRSF2", "SRSF3", "SRSF4", "SRSF5", "SRSF6", "SRSF7", "SRSF9", "SSB", "STAR-PAP", "SUPT6H", "SYNCRIP", "TAB2", "TAB3", "TAF15", "TARDBP", "TDRD10", "TDRKH", "TEX13A", "THOC4", "TIA1", "TIAL1", "TMEM63A", "TMEM63B", "TNRC6A", "TNRC6B", "TNRC6C", "TOE1", "TRA2A", "TRA2B", "TRMT1", "TRMT2A", "TRNAU1AP", "TTC14", "U2AF1", "U2AF1L4", "U2AF2", "U2SURP", "UHMK1", "UNK", "UNKL", "UPF3B", "YAF2", "YB-1", "YBX2", "YTHDC1", "YTHDC2", "YTHDF1", "YTHDF2", "ZC3H10", "ZC3H13", "ZC3H15", "ZC3H18", "ZC3H3", "ZC3H4", "ZC3H6", "ZC3H7A", "ZC3H7B", "ZC3H8", "ZCCHC11", "ZCCHC13", "ZCCHC14", "ZCCHC17", "ZCCHC2", "ZCCHC3", "ZCCHC5", "ZCCHC6", "ZCCHC7", "ZCCHC8", "ZCCHC9", "ZCRB1", "ZFP36", "ZFP36L1", "ZFP36L2", "ZGPAT", "ZMAT5", "ZNF638", "ZRANB1", "ZRANB2", "ZRANB3", "ZRSR1", "ZRSR2")

# ============================================
# 5. Extract all PWMs from background & filter to RBPs
# ============================================
#all_pwms <- bg_model@pwms  #2287
#rbp_pwms <- all_pwms[
#  vapply(names(all_pwms),
#         function(nm) any(nm %in% rbp_genes),
#         logical(1))
#]

# 如果你想做模糊匹配（基因名作为子串出现），可以用：
rbp_pwms <- all_pwms[grepl(paste(rbp_genes, collapse="|"), names(all_pwms), ignore.case=TRUE)]

cat("Number of RBP PWMs selected:", length(rbp_pwms), "\n")
cat("Example PWM names:\n", head(names(rbp_pwms), 10), "\n")

# ============================================
# 6. Create a custom background model with only RBP PWMs
# ============================================
custom_bg <- bg_model
custom_bg@pwms <- rbp_pwms

# ============================================
# 7. Run motif enrichment
# ============================================
enrichment_result <- motifEnrichment(utr_3_down_clean, custom_bg)

# ============================================
# 8. Generate report & convert to data.frame
# ============================================
report_list <- groupReport(enrichment_result)
report_df   <- as.data.frame(report_list)

# ============================================
# 9. Apply BH correction and filter significant hits
# ============================================
report_df$adj_pval <- p.adjust(report_df$p.value, method = "BH")
significant_hits <- subset(report_df, adj_pval < 0.05)

# ============================================
# 10. View results
# ============================================
# Display motif name, raw p-value, adjusted p-value, and enrichment score
colnames(significant_hits)[
match(c("target","id","raw.score"), colnames(significant_hits))
] <- c("Gene", "Motif_ID", "EnrichmentScore")

print(
head(
    significant_hits[, c("Gene","Motif_ID","EnrichmentScore","p.value","adj_pval")],
    6
)
)

p <- ggplot(significant_hits,
            aes(x = reorder(Motif_ID, adj_pval),
                y = -log10(adj_pval))) +
geom_col() +
coord_flip() +
labs(
    x = "Motif ID",
    y = expression(-log[10]("FDR‑adjusted p‑value")),
    title = "Significant RBP Motif Enrichment"
) +
theme_minimal()
ggsave(
filename = "RBP_motif_enrichment.png",  # or any path you like
plot     = p,
width    = 8,     # inches
height   = 6,     # inches
dpi      = 300    # resolution
)

Perform similar RBP-mapping by myself

# Load a set of RBP motifs from a database (e.g., RBPDB) RBPDB, ENCODE, JASPAR, or even UCL's RBP motifs.

# Set path to your PWM directory
pfm_dir <- "/media/jhuang/Elements/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/motif_analysis/PFMDir"

# List all .pfm files
pfm_files <- list.files(pfm_dir, pattern = "\\.pfm$", full.names = TRUE)

# Create an empty list to store PWMs
pfm_list <- list()

# Loop through each file
for (file in pfm_files) {
# Read table
pfm <- tryCatch({
    mat <- read.table(file, header = FALSE)
    mat <- as.matrix(mat)

    # Optionally add column names (assuming 4 columns for A, C, G, T)
    if (ncol(mat) == 4) {
    colnames(mat) <- c("A", "C", "G", "T")
    }

    mat
}, error = function(e) {
    message("Failed to read: ", file)
    NULL
})

# Use the filename (without extension) as list name
name <- tools::file_path_sans_ext(basename(file))
pfm_list[[name]] <- pfm
}

# Check one example
str(pfm_list[[1]])
#saveRDS(pfm_list, file = "compiled_pfm_list.rds")

#The error you're seeing (invalid PFM 'x': not an integer matrix) indicates that the PWM function expects the input matrix to represent raw position frequency matrices (PFMs), which are expected to be integer counts, but the matrices you have are likely log-odds matrices (floating point values), which represent the log-transformed scores instead of raw counts.

# convert to PFMatrix
pfm_named_list <- lapply(pfm_list, function(mat) {
mat <- as.matrix(mat)
if (!all(rownames(mat) %in% c("A", "C", "G", "T"))) {
    rownames(mat) <- c("A", "C", "G", "T")
}
PFMatrix(ID = "motif", name = "motif", profileMatrix = mat)
})

# Create an empty list to store results
match_results_list <- list()

# Iterate through each PFMatrix object in the list
for (i in seq_along(pfm_named_list)) {
# Get the current PFMatrix object
current_pfm <- pfm_named_list[[i]]

# Run matchMotifs for the current motif
match_result <- tryCatch({
    #matchMotifs(pfm_obj, subject = utr_3_down, genome = NULL, p.cutoff = 1e-3)
    matchMotifs(current_pfm, subject = utr_3_down, genome = NULL, p.cutoff = 1e-3)
}, error = function(e) {
    message("Error with motif ", names(pfm_named_list)[i], ": ", e$message)
    return(NULL)  # Return NULL if error occurs
})

# Store the result if no error
if (!is.null(match_result)) {
    match_results_list[[names(pfm_named_list)[i]]] <- match_result
}
}

# Check the results
match_results_list

# Extract motif match results for one motif
motif_matches_1004_8676391 <- assay(match_results_list[["1004_8676391"]], "motifMatches")

# Convert sparse matrix to dense matrix
dense_matrix <- as.matrix(motif_matches_1004_8676391)

# Optional: Add rownames for better visualization (if available)
if (!is.null(names(utr_3_down))) {
rownames(dense_matrix) <- names(utr_3_down)
}

# Convert logical matrix to numeric (1 for TRUE, 0 for FALSE)
dense_matrix_num <- matrix(as.numeric(dense_matrix),
                        nrow = nrow(dense_matrix),
                        dimnames = dimnames(dense_matrix))

# Now plot the heatmap
png("motif_match_1004_8676391.png", width = 800, height = 1600)
pheatmap(dense_matrix_num,
        cluster_rows = FALSE,
        cluster_cols = FALSE,
        color = c("grey", "steelblue"),
        legend_breaks = c(0, 1),
        legend_labels = c("No match", "Match"),
        main = "Motif Match: 1004_8676391")
dev.off()

#✅ Output
#You will see a binary heatmap, where:
#   * Blue cells = motif matched
#   * White cells = no match
#Let me know if you want to loop this over all motifs or save the plot to a file.

Does hg19 vs hg38 Position Matter in Motif Enrichment?

✅ If you're doing motif enrichment analysis only (like PWMEnrich):

Position DOES NOT matter.
Here's why:
Aspect  Explanation
🔍 PWMEnrich Only scans sequences for motif matches based on nucleotide content (e.g., enrichment of AU-rich motifs in downregulated 3′UTRs).
📏 Coordinates   Genomic coordinates (hg19 vs hg38) are not used in enrichment. Only the sequence itself matters.
⚖️ Statistical Test The enrichment is based on observed motif frequency vs background expectations (GC-aware or custom). No genome alignment is done.

So:

    ✅ As long as your 3′ UTR sequences are correct and biologically relevant, using an hg19 background is safe — even if your gene models are from hg38.

❗️ When does genome version matter?

If you're doing things like:
Use case    Position-sensitive?
🔬 Variant effect on motifs (e.g., SNPs in motifs)   ✅ Yes
🧭 Mapping motifs to genome positions (e.g., ChIP-seq overlaps)  ✅ Yes
📦 Loading sequences from coordinates (not FASTA)    ✅ Yes
🧠 Expression + genomic track integration (e.g., motif + ATAC-seq)   ✅ Yes

Then you must ensure genome versions match exactly.
✅ In Your Case: Motif Enrichment of Downregulated 3′UTRs

    If you are:

    Feeding real 3′UTR sequences (as DNAStringSet) for downregulated genes

    Using PWMEnrich purely for motif enrichment

Then:

    ✅ hg19 vs hg38 does not matter
    ❗ Only sequence content matters

Let me know if you want to later map enriched motifs back to genome positions — that’s when we’d switch to position-aware tools like FIMO, MOODS, or rtracklayer.

Custom RBP-database preparation (failed!)

# Example: Create a PWM for 2 fake motifs (replace with your actual PWMs)
#motif1 <- matrix(c(0.2, 0.3, 0.3, 0.2,
#                   0.1, 0.4, 0.4, 0.1,
#                   0.25, 0.25, 0.25, 0.25,
#                   0.3, 0.2, 0.3, 0.2), nrow = 4, byrow = TRUE,
#                 dimnames = list(c("A", "C", "G", "T"), NULL))
#motif2 <- matrix(c(0.3, 0.2, 0.2, 0.3,
#                   0.4, 0.1, 0.4, 0.1,
#                   0.25, 0.25, 0.25, 0.25,
#                   0.2, 0.3, 0.2, 0.3), nrow = 4, byrow = TRUE,
#                 dimnames = list(c("A", "C", "G", "T"), NULL))
#rbp_pwms <- list(RBP1 = motif1, RBP2 = motif2)

pwm_dir <- "/media/jhuang/Elements/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/motif_analysis/PWMDir"
pwm_files <- list.files(pwm_dir, pattern = "\\.pwm$", full.names = TRUE)
rbp_pwms <- list()
for (file in pwm_files) {
# Read table
pwm <- tryCatch({
    mat <- read.table(file, header = FALSE)
    mat <- as.matrix(mat)

    # Optionally add column names (assuming 4 columns for A, C, G, T)
    if (ncol(mat) == 4) {
    colnames(mat) <- c("A", "C", "G", "T")
    }

    mat
}, error = function(e) {
    message("Failed to read: ", file)
    NULL
})

# Use the filename (without extension) as list name
name <- tools::file_path_sans_ext(basename(file))
rbp_pwms[[name]] <- pwm
}

# -- filtering (ERROR) --
pwm_list <- motifs(bg_model)
# Example: filter motifs containing 'ELAVL' or 'HNRNP'
#rbp_pwms <- pwm_list[grep("ELAVL|HNRNP|IGF2BP|RBFOX|FMR1|PUM", names(pwm_list))]
rbp_enrichment <- motifEnrichment(utr_3_down, bg_model, pwmList = rbp_pwms)
rbp_report <- groupReport(rbp_enrichment)

# -- from MotifDb --
library(MotifDb)
human_motifs <- query(MotifDb, "Hsapiens")
rbp_motifs <- query(human_motifs, "RBP")

# ---- try Build your background model directly from these PWMs ----

# 1. List all PWM files
files_all <- list.files("~/Downloads/cisbp-rna/pwms_all_motifs", pattern="\\.txt$", full.names=TRUE)

# 2. Filter out empty files
file_sizes <- file.info(files_all)$size
files     <- files_all[file_sizes > 0]
cat("Reading", length(files), "non‑empty PWM files out of", length(files_all), "\n")

# 3. Define the reader (as before)
readCisbpPfm <- function(path) {
df <- read.table(path, header=TRUE, stringsAsFactors=FALSE)
mat_rna <- as.matrix(df[, c("A","C","G","U")])
mat_dna <- t(mat_rna)
rownames(mat_dna) <- c("A","C","G","T")
sweep(mat_dna, 2, colSums(mat_dna), FUN="/")
}

# 4. Read only the non‐empty files
cisbp_mats <- lapply(files, readCisbpPfm)
names(cisbp_mats) <- tools::file_path_sans_ext(basename(files))

# 5. (Optional) Check the first few
head(names(cisbp_mats))
length(cisbp_mats)  #193

# 6. Build your background model directly from these PWMs:
bg_utrs <- utr_3_background_clean    #readDNAStringSet("run2/3utr_background.fasta")
#bg_utrs <- bg_utrs[ width(bg_utrs)>=6 & !grepl("[^ACGT]", as.character(bg_utrs)) ]

custom_bg <- makeBackground(
seqs    = bg_utrs,
motifs = cisbp_mats,
type    = "logn",
organism = "hg19",
verbose  = TRUE
)

custom_bg <- makeBackground(
sequences = bg_utrs,       # your DNAStringSet of background UTRs
motifs    = cisbp_mats,    # your named list of PWM matrices
type      = "logn",        # log‑normal GC‑aware model
organism  = NULL,          # disable built‑in genomes
verbose   = TRUE
)

custom_bg <- makeBackground(
cisbp_mats,        # 1) motifs
organism = "hg19",
type    = "logn",  # 2) background model type
bg.seq  = bg_utrs, # 3) override organism by supplying your own sequences
quick   = FALSE    # (optional) fit to all sequences for best accuracy
)

# 1. Pick a library size to re‑scale your PWMs into PFMs
library_size <- 1000   # you can also try 1000

# 2. Convert each normalized matrix into integer counts
cisbp_pfms <- lapply(cisbp_mats, function(pmat) {
counts <- round(pmat * library_size)
# ensure no column sums to zero (add pseudocount 1 if needed)
zero_cols <- which(colSums(counts) == 0)
if (length(zero_cols) > 0) {
    counts[, zero_cols] <- 1
}
counts
})

# 3. Build a custom background (PFMs + your bg UTRs)
custom_bg <- makeBackground(
cisbp_pfms,      # 1) a list of integer PFMs
type   = "logn", # 2) log‑normal GC‑aware model
bg.seq = bg_utrs,# 3) your background DNAStringSet
quick  = FALSE   # fit on all sequences
)

# … then continue with motifEnrichment() on your down‑regulated UTRs as before.

# 5) Read & clean your down‑regulated 3′UTRs
down_utrs <- readDNAStringSet("downregulated_utrs.fasta")
down_utrs <- down_utrs[ width(down_utrs)>=6 & !grepl("[^ACGT]", as.character(down_utrs)) ]

# 6) Run the enrichment
res    <- motifEnrichment(down_utrs, custom_bg)
report <- groupReport(res)
df     <- as.data.frame(report)
df$adj_pval <- p.adjust(df$p.value, method="BH")

# 7) Subset significant
sig <- subset(df, adj_pval < 0.05)
print(sig[, c("motif.name","p.value","adj_pval","enrichmentScore")])

like unlike

点赞本文的读者

还没有人对此文章表态

本文有评论

没有评论

Analysis of the RNA binding protein motifs for RNA Seq

本文有评论

看文章，发评论，不要沉默

最受欢迎文章

最新文章

最多评论文章

推荐相似文章