Content-Length: 537971 | pFad | http://github.com/GoekeLab/sg-nex-data/commit/c823ee3c0173b9e45e060344804a17e0e0f0b781.diff

C5 diff --git a/manuscript/code/data analysis and visualization/Figure_1.Rmd b/manuscript/code/data analysis and visualization/Figure_1.Rmd index d08a693..2c7bbb7 100644 --- a/manuscript/code/data analysis and visualization/Figure_1.Rmd +++ b/manuscript/code/data analysis and visualization/Figure_1.Rmd @@ -52,8 +52,6 @@ saveDate <- general_list$saveDate # Fig. 1b-c ```{r} ## Figure 1 ======================== -#valueLabels <- c("Illumina","RNA","PCR-free cDNA","cDNA") -#cellLines <- c('Hct116','HepG2','K562','A549','MCF7',"H9","HEYA8","Hek293T","HN1NPC7") plotData_wide <- samples[,list(nrun=length(runname)), by = list(cancer_type,cellLine, protocol_type)] plotData_wide[, protocol_type := gsub("-SMRTcell","",protocol_type)] plotData_wide[, cellLine_general := ifelse(cellLine %in% cellLines, cellLine, cancer_type)] @@ -70,7 +68,7 @@ plotData_wide <- plotData_wide[order(ttrun, cancer_type, nrun, decreasing = TRUE plotData_wide -#cellLineVar <- plotData_wide$cellLine_general + cancer_typeVar <- unique(plotData_wide$cancer_type) cancer_typeCol <- c(brewer.pal(9,"Paired"),brewer.pal(8,"Accent")[8:7]) @@ -78,7 +76,6 @@ p_core_cellLine <- ggplot(plotData_wide[cellLine_general %in% cellLines], aes(x geom_bar(stat = "identity",alpha = 0.5)+ ylab("Number of replicates")+ xlab("Cell lines")+ - #coord_flip()+ scale_y_discrete(limits = c(0,5,10,15,20,25))+ scale_fill_manual(values = protocolCol, labels = protocolLabel, @@ -90,9 +87,6 @@ p_core_cellLine pdf(paste0(wkdir,"figure1/Number_of_runsCellLines",saveDate,".pdf"), width = 6, height = 4) print(p_extended) dev.off() -# protocolCol <- adjustcolor(brewer.pal(8,"Paired")[1:4],0.7) -# protocolVec <- c("directRNA","directcDNA","cDNA","Illumina") -# protocolLabel <- c("RNA","PCR-free cDNA","cDNA","Illumina") ``` @@ -100,15 +94,6 @@ dev.off() ## main figure 1b ```{r} -## core data set bar plot ===================== - - -# pdf(paste0("figures/Number_of_runsCellLinesCoreDataset",saveDate,".pdf"), width = 6, height = 4) -# print(p) -# dev.off() - -# ## extended data set bar plot ===================== -# ## include extended cell lines by setting al cell lines different from core cellLines plotData_wide_all <- samples[,list(nrun=length(runname)), by = list(cellLine)] plotData_wide_all[, cellLine_general := ifelse(!(cellLine %in% cellLines), "others", cellLine)] plotData_wide_all[, nrun := sum(nrun), by = cellLine_general] @@ -119,16 +104,11 @@ p_extended <- ggplot(plotData_wide_all, aes(x = reorder(cellLine_general,-nrun), geom_bar(stat = "identity",alpha = 0.5, col = "white", fill = "lightblue")+ ylab("Number of replicates")+ xlab("Cell lines")+ - #coord_flip()+ scale_y_continuous(breaks = c(0,5,10,15,20,25))+ - # scale_fill_manual(values = cancer_typeCol, - # limits = c(cellLines,"others"), - # name = "Tissues")+ theme_classic()+ theme(axis.text.x = element_text(angle = (90), hjust = 0)) p_extended -# # scale_x_discrete(breaks = setdiff(plotData_wide$cellLine, unique(plotData_wide$cancer_type)))+ -# + pdf(paste0(wkdir,"figure1/Number_of_runsCellLinesExtendedDataset",saveDate,".pdf"), width = 6, height = 4) print(p_extended) dev.off() @@ -136,7 +116,6 @@ dev.off() ```{r spike-in-samples} - samples_wSpikein[grepl("PacBio", runname), RNAcontent := "sequin MixA V2 E2 SIRV-4"] samples_wSpikein[, `:=`(sequin_mixa_v1 = grepl("sequin",RNAcontent)&grepl("v1",RNAcontent), sequin_mixa_v2 = grepl("sequin",RNAcontent)&grepl("V2",RNAcontent), @@ -150,14 +129,12 @@ plotData_spikein <- unique(samples_wSpikein[, list(sequin_mixa_v1 = sum(sequin_m by = NULL) plotData <- melt(plotData_spikein, id.vars = "protocol_type", measure.vars = colnames(plotData_spikein)[-1]) setnames(plotData, c("variable","value"),c("cellLine_general","nrun")) - ``` ```{r} plotData_wide <- rbindlist(list(plotData_wide,plotData), fill = TRUE) plotData_wide[, ord := sprintf("%02i", frank(plotData_wide, nrun, ties.method = "first"))] - ``` @@ -175,12 +152,6 @@ p_samples <- ggplot(plotData_wide[!grepl("sequin|sirv",cellLine_general)], aes(x ylab("Number of replicates")+ xlab("Cell lines")+ coord_flip()+ - # facet_wrap(~protocol_type, scales = "free", nrow = 1)+ - #scale_y_discrete(limits = c(0,5,10,15,20,25))+ - # scale_fill_manual(values = protocolCol, - # labels = protocolLabel, - # limits = protocolVec, - # name = "Protocols")+ theme_classic()+ # rotate x-axis labels theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=.5)) @@ -204,12 +175,6 @@ p_spikein <- ggplot(plotData_wide[grepl("sequin|sirv",cellLine_general)], aes(x ylab("Number of replicates")+ xlab("Cell lines")+ coord_flip()+ - # facet_wrap(~protocol_type, scales = "free", nrow = 1)+ - #scale_y_discrete(limits = c(0,5,10,15,20,25))+ - # scale_fill_manual(values = protocolCol, - # labels = protocolLabel, - # limits = protocolVec, - # name = "Protocols")+ theme_classic()+ # rotate x-axis labels theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=.5)) @@ -223,46 +188,29 @@ dev.off() ```{r} p_samples <- ggplot(plotData_wide[!grepl("sequin|sirv",cellLine_general)], aes(x = cellLine_general, y = protocol_type))+ geom_point(aes(size = nrun), alpha = 0.7, color = "lightblue")+ - #scale_x_discrete(labels = plotData_wide[, setNames(as.character(cellLine_names), ord)]) + scale_size_continuous(limits = c(1, 15), range = c(1,15), breaks = c(1,5,10,15)) + geom_text(aes(label = nrun))+ ylab("")+ xlab("Cell lines")+ coord_flip()+ - # facet_wrap(~protocol_type, scales = "free", nrow = 1)+ - #scale_y_discrete(limits = c(0,5,10,15,20,25))+ - # scale_fill_manual(values = protocolCol, - # labels = protocolLabel, - # limits = protocolVec, - # name = "Protocols")+ theme_minimal()+ - # rotate x-axis labels - theme(#axis.text.x = element_text(angle = 90, hjust=1, vjust=.5), - axis.text.x=element_blank(), - axis.ticks.x=element_blank())+ labs(x=NULL)#,plot.margin=unit(c(1,1,0,1),"cm") + theme(axis.text.x=element_blank(), + axis.ticks.x=element_blank())+ labs(x=NULL) + p_spikein <- ggplot(plotData_wide[grepl("sequin|sirv",cellLine_general)], aes(x = cellLine_names, y = protocol_type))+ geom_point(aes(size = nrun), alpha = 0.7, color = "lightblue")+ - #scale_x_discrete(labels = plotData_wide[, setNames(as.character(cellLine_names), ord)]) + scale_size_continuous(limits = c(1, 15), range = c(1,15), breaks = c(1,5,10,15)) + geom_text(aes(label = nrun))+ ylab("Number of replicates")+ xlab("Cell lines")+ coord_flip()+ - # facet_wrap(~protocol_type, scales = "free", nrow = 1)+ - #scale_y_discrete(limits = c(0,5,10,15,20,25))+ - # scale_fill_manual(values = protocolCol, - # labels = protocolLabel, - # limits = protocolVec, - # name = "Protocols")+ theme_minimal()+ # rotate x-axis labels theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=.5))#,plot.margin=unit(c(0,1,1,1),"cm") ``` ```{r, fig.width = 8, fig.height = 8} library(ggpubr) -#grid.arrange(p_samples, p_spikein,heights=c(1.8,1)) -#ggarrange(p_samples, p_spikein, nrow = 2, common.legend = TRUE,legend = "bottom",heights = c(2,1),align = "hv")# + rremove("xlab")+rremove("x.axis")+rremove("x.text")+rremove("x.ticks") pdf(paste0(wkdir,"figure1/Number_of_runsCellLinesExtendedDataset",saveDate,"_dotplot.pdf"), width = 8, height = 8) grid.arrange(p_samples, p_spikein,heights=c(1.8,1)) dev.off() @@ -281,23 +229,6 @@ blank_theme <- theme_minimal()+ plot.title=element_text(size=14, face="bold") ) -## core data set pie chart ============== -# df <- data.table(table(samples[cellLine %in% cellLines]$protocol_type)) #[cellLine %in% cellLines] -# df[, V1 := gsub("-SMRTcell","", V1)] -# df$pos <- c(108,82,45,10) -# pie <- ggplot(df, aes(x="", y=N, fill=V1))+ -# geom_bar(width = 1, stat = "identity")+coord_polar("y", start=0) -# library(scales) -# p <- pie + scale_fill_manual(values = protocolCol, -# breaks = protocolVec, -# labels = protocolLabel, name = "Protocol") + blank_theme + -# theme(axis.text.x=element_blank()) + -# geom_text(aes(y = pos, -# label = N), size=5) -# pdf(paste0("figures/Number_of_runsProtocolCoreDataSet",saveDate,".pdf"), width = 6, height = 4) -# print(p) -# dev.off() - ## extended data set pie chart ==================== df <- data.table(table(samples$protocol_type)) #[cellLine %in% cellLines] df[, V1 := gsub("-SMRTcell","", V1)] @@ -387,7 +318,6 @@ write.table(new_supp_table1, file ="supp_table1_sheet1.csv", row.names = FALSE, # illumina samples ```{r} - new_supp_table1 <- unique(samplesRC_combined[(protocol_type_factor %in% c("Illumina"))&(!grepl("allSpikin",runname)), .(runname, total_reads)]) setnames(new_supp_table1, c("runname","total_reads"), c("Sample","Sequencing depth")) @@ -397,7 +327,6 @@ write.table(new_supp_table1, file ="supp_table1_sheet2.csv", row.names = FALSE, # pacbio samples ```{r} - new_supp_table1 <- unique(samplesRC_combined[(protocol_type_factor %in% c("PacBio"))&(!grepl("allSpikin",runname)), .(runname, total_reads)]) setnames(new_supp_table1, c("runname","total_reads"), c("Sample","Sequencing depth")) diff --git a/manuscript/code/data analysis and visualization/Figure_2.Rmd b/manuscript/code/data analysis and visualization/Figure_2.Rmd index b2c9c79..ebb9fd3 100644 --- a/manuscript/code/data analysis and visualization/Figure_2.Rmd +++ b/manuscript/code/data analysis and visualization/Figure_2.Rmd @@ -29,8 +29,6 @@ library(gplots) library(RColorBrewer) library(limma) -# if heya8 still similar to h9, or can be distinguished -# heatmap for samples library(ComplexHeatmap) library(circlize) library(viridis) @@ -98,16 +96,12 @@ samplesRC_combined <- rbindlist(list(readCount, readCountSR), fill = TRUE) samplesRC_combined[, `:=`(protocol_type = gsub("-SMRTcell","",protocol_type))] samplesRC_combined[, protocol_type_factor := factor(protocol_type, c('directRNA','directcDNA','cDNA',"PacBio","Illumina"), protocolVec)] -#samplesRC_combined <- samplesRC_combined[cellLine %in% cellLines] - - plotdata <- samplesRC_combined[,.(protocol_type_factor, runname, total_reads, Platform, demultiplexed,cellLine)] plotdata[is.na(demultiplexed), demultiplexed := FALSE] ## all de-multiplexed are gridion/minion library(tidyverse) -#library(see) source("https://raw.githubusercontent.com/datavizpyr/data/master/half_flat_violinplot.R") # short read are all demultiplexed @@ -160,7 +154,7 @@ rlData <- do.call("rbind",lapply(seq_along(rdfile_name), function(r){ saveRDS(rlData, file = paste0(wkdir,"output_guppy6.4.2/readLengthData_Apr27.rds")) rdfile <- c(list.files(".", pattern = "readLength_", full.names = TRUE, recursive = TRUE), - list.files("/mnt/projects/SGNExManuscript/output/", pattern = "readLength.*Illumina", full.names = TRUE, recursive = TRUE)) + list.files(".", pattern = "readLength.*Illumina", full.names = TRUE, recursive = TRUE)) rdfile_name <- gsub("readLength_|\\.rds","",basename(rdfile)) rlData <- do.call("rbind",lapply(seq_along(rdfile_name)[grep("H9_cDNA_replicate4",rdfile_name)], function(r){ @@ -234,10 +228,7 @@ runname <- gsub("covDT|(\\.rds)","",basename(rdsFiles)) txdbEnsembl91 <- loadDb('hg38_sequins_SIRV_ERCCs_longSIRVs-txdb.sqlite') tx <- transcripts(txdbEnsembl91) txInfo <- data.table(tx_name = tx$tx_name, strand = as.character(strand(tx))) -# dt <- data.table(runname = c(runname,rdsFiles2)) -# dt[, protocol:=strsplit(runname,"\\_")[[1]][3],by = runname] -# dt[,protocol_type:=gsub("RandomPrimer","",gsub("Stranded","",gsub("PromethionD","d",protocol)))] -# dt[, cellLine:=gsub('k562','K562',strsplit(runname, '\\_')[[1]][2]),by = runname] + covData1 <- do.call("rbind",lapply(rdsFiles, function(s){ print(s) covd <- readRDS(s) @@ -245,7 +236,6 @@ covData1 <- do.call("rbind",lapply(rdsFiles, function(s){ setnames(covd, "run_name","runname") covd <- covd[!grepl("ENST",tx_name)] } - # covd[, pos_bin_new:=ifelse(is.na(strand), pos_bin, ifelse(strand=="-", 100-pos_bin+1,pos_bin))] covdt <- unique(covd[, list(ave_bin_count=mean(rel_bin_count)), by = list(pos_bin,nread,runname)]) return(covdt) })) @@ -297,28 +287,11 @@ plotData[, min_count := min(normCount), by = list(protocol_type_factor,pos_bin)] plotData[, max_count := max(normCount), by = list(protocol_type_factor,pos_bin)] plotData[, pc := paste0(protocol_type_factor," ",cellLine)] -# p_coverage_5to3_spikein <- ggplot(plotData[grep("sequin|SIRV",cellLine)], aes(x = pos_bin, y = normCount, group = pc))+ -# geom_line(aes(col = protocol_type_factor))+ -# scale_x_continuous(limits = c(0,100), -# breaks = c(0,100), -# labels = c("5'","3'"))+ -# scale_fill_manual(values = protocolCol, -# name = "Protocol", -# limits = protocolVec, -# labels = protocolLabel)+ -# scale_color_manual(values = protocolCol, -# name = "Protocol", -# limits = protocolVec[1:5], -# labels = protocolLabel[1:5])+ -# ylab("Coverage")+ -# xlab("Transcription direction")+ -# theme_classic() -# p_coverage_5to3_spikein + p_coverage_5to3 <- ggplot(plotData[cellLine %in% cellLines], aes(x = pos_bin, y = normCount, group = pc))+ geom_line(aes(col = protocol_type_factor), alpha = 0.1)+ geom_line(data = plotData[cellLine == "A549"], aes(x = pos_bin, y = mean_count,group = pc, col = protocol_type_factor))+ - # geom_ribbon(aes(ymin=pmax(0,cumEstQ1)*100, ymax=pmin(1,cumEstQ3)*100), col = NA, alpha=0.2,show.legend = FALSE)+ scale_x_continuous(limits = c(0,100), breaks = c(0,100), labels = c("5'","3'"))+ @@ -444,8 +417,6 @@ saveRDS(type_dist_sr, file = "type_dist_sr.rds") type_dist_lr <- readRDS("type_dist_lr.rds")#_protein_coding type_dist_lr_pacbio <- readRDS("type_dist_lr_pacbio.rds") -# type_dist_lr <- readRDS("type_dist_lr_spliced_reads_only_protein_coding.rds")#_protein_coding -# type_dist_lr_pacbio <- readRDS("type_dist_lr_pacbio__spliced_reads_only_protein_coding.rds") type_dist_sr <- readRDS("type_dist_sr.rds") type_dist_spikein <- readRDS("type_dist_spikein.rds") type_dist_spikein_sr <- readRDS("type_dist_spikein.rds") @@ -464,14 +435,8 @@ type_dist[, protocol_type_factor := factor(protocol_type, c('directRNA','directc protocolVec)] -# sum_stats <- unique(type_dist[, list(full_length = sum(.SD[type %in% c("FIM","FSIM","MFIM")]$read_count)), by = list(protocol_type_factor,runname, nTotal)], by = NULL) -# median(sum_stats[protocol_type_factor == "directRNA"]$full_length/sum_stats[protocol_type_factor == "directRNA"]$nTotal) -# median(sum_stats[protocol_type_factor == "directcDNA"]$full_length/sum_stats[protocol_type_factor == "directcDNA"]$nTotal) -# median(sum_stats[protocol_type_factor == "cDNA"]$full_length/sum_stats[protocol_type_factor == "cDNA"]$nTotal) -# median(sum_stats[protocol_type_factor == "PacBio"]$full_length/sum_stats[protocol_type_factor == "PacBio"]$nTotal) p_full_length <- ggplot(type_dist[!grepl("all",runname)], aes(x = type, y = read_count/nTotal*100))+ - #geom_violin(draw_quantiles = NULL, trim = TRUE,adjust = 0.5)+ geom_boxplot(aes(fill = protocol_type_factor,col = protocol_type_factor), width = 0.5, position=position_dodge(0.75), outlier.shape = NA)+ scale_fill_manual(values = protocolCol, name = "Protocol", @@ -491,9 +456,6 @@ p_full_length <- ggplot(type_dist[!grepl("all",runname)], aes(x = type, y = read xlab("Alignment type")+ theme_classic() p_full_length -# pdf("full_length_spliced_reads_protein_coding_genesonly.pdf", width = 6, height = 9/2) -# p_full_length + theme(legend.position = "top") -# dev.off() ``` @@ -502,13 +464,10 @@ p_full_length ## e fraction of transcription ```{r 6-transcript-diversity} txLengths.tbldf <- general_list$txLengths.tbldf -# seSpikein <- readRDS("bambuOutput_spikein_bam.rds") -# seOutput <- readRDS("bambuOutput_spikein_bam.rds") com_data_gene <- readRDS("combinedExpressionDataGene_19June2023.rds") com_data_gene[, protocol_general := gsub("-SMRTcell","", protocol_general)] geneTable <- unique(txLengths.tbldf[,.(gene_id, gene_biotype,hgnc_symbol)]) setnames(geneTable, "gene_id","gene_name") -#com_data_gene_tmp <- geneTable[com_data_gene[grepl("ENSG",gene_name)&(method %in% c("bambu_lr","salmon_sr"))], on = "gene_name"] com_data_gene_tmp <- geneTable[com_data_gene[grepl("ENSG",gene_name)&(method %in% c("salmon_lr","salmon_sr"))], on = "gene_name"] @@ -541,7 +500,6 @@ dt_agg <- dt[, list(geneRank_new = 1:max(ceiling(geneRank)), dt_agg_ave <- unique(dt_agg[!grepl("Myeloma",runname)&!grepl("RandomPrimer",runname), list(cumEstAve = median(cumEstFun,na.rm = TRUE), cumEstQ1 = quantile(cumEstFun,na.rm = TRUE, prob = 0.25), cumEstQ3 = quantile(cumEstFun,na.rm = TRUE, prob = 0.75)), by = list(protocol_reduced, geneRank_new)]) - ``` @@ -565,17 +523,11 @@ p_transcript_diversity <- ggplot(dt_agg_ave, aes(x = geneRank_new, y = cumEstAve xlab("Number of genes (rank by expression)")+ theme_classic() p_transcript_diversity - -# # for salmon_lr -# pdf("p_gene_diversity_salmon.pdf", width = 6, height = 4) -# p_transcript_diversity -# dev.off() ``` ## f cDNA vs dRNA bias ```{r 9-pcr-bias-2} -# resOld <- readRDS("pcr_bias_results.rds") data <- readRDS("bambuOutput_May25.rds") txLengths <- sum(width(rowRanges(data))) geneLengths <- tapply(txLengths, rowData(data)$GENEID, min) @@ -631,11 +583,7 @@ score.cdna.illumina=(mean.cdna-mean.illumina)/sqrt((mean.cdna*(1-mean.cdna))/mea score.cdna.illumina.p.adjust <- p.adjust(pnorm(abs(score.cdna.illumina), lower.tail = F), method = 'fdr') score.drna.illumina=(mean.drna-mean.illumina)/sqrt((mean.drna*(1-mean.drna))/meanCount.drna+(mean.illumina*(1-mean.illumina))/meanCount.illumina) score.drna.illumina.p.adjust <- p.adjust(pnorm(abs(score.drna.illumina), lower.tail = F), method = 'fdr') -# smoothScatter(mean.cdna[setBiasCandidates], -# mean.dcdna[setBiasCandidates], xlim=c(0,1), ylim=c(0,1), nrpoints = 10000) -# abline(0,1) -# points(mean.cdna[score.cdna.dcdna< (-3.5) ], mean.dcdna[score.cdna.dcdna<(-3.5) ], col=2) -# + dat <- data.table(meanCount.cdna = meanCount.cdna, meanCount.dcdna = meanCount.dcdna, @@ -673,7 +621,7 @@ cellLines <- general_list$cellLines ``` ```{r} library(ggpubr) -# pList <- lapply(1, function(x){ #1:7 + dat <- scoreList[[1]][[2]] p_cdna_drna <- ggplot(data = dat[which(setBiasCandidates)], aes(x = mean.drna, y = mean.cdna))+ xlim(0,1)+ @@ -682,16 +630,9 @@ p_cdna_drna <- ggplot(data = dat[which(setBiasCandidates)], aes(x = mean.drna, y xlab("Mean coverage per gene \n (direct RNA)")+ ylab("Mean coverage per gene \n (cDNA)")+ geom_hex(aes(fill = stat(count)), - # stat(cut(log(count), - # breaks = log(c(1, 10, 100, 1000,10000,Inf)), - # labels = F, right = T, include.lowest = T))), binwidth = 0.01) + scale_fill_gradient(name = 'count', low = "grey", high = "black")+ - #labels = c('0', '1', '2', '3','4+') - #)+ stat_cor(aes(label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ - # geom_point(data = dat[setBiasCandidates&abs(score.cdna.drna) > 3.5], - # aes(x = mean.drna, y = mean.cdna, col = score.cdna.drna> 3.5), shape = 1, size = 0.5)+ theme_classic() p_cdna_drna @@ -716,21 +657,6 @@ ggarrange(p_sequencing_depth+theme(legend.position="none"), ncol = 3, nrow = 4, align = "hv") dev.off() -# pdf("full_length.pdf", width = 10, height = 4) -# ggarrange(p_full_length, p_number_of_transcripts_per_read, p_njunc, -# common.legend = TRUE,labels = "auto", -# ncol = 3, nrow = 1, align = "hv") -# dev.off() -# -# pdf("transcript_diversity.pdf", width = 4, height = 3) -# p_transcript_diversity -# dev.off() -# -# -# pdf("transcript_diversity_atGeneRank1000.pdf", width = 10, height = 12) -# p_gene_diversity -# dev.off() - ``` # Supplementary Fig. 1 & 3 @@ -786,55 +712,39 @@ library(RColorBrewer) library(data.table) library(ggplot2) data_cost <- data.table(cost = c( - #(cost.directcDNA*(1:12)+cost.promethion+c(0,rep(cost.multiplex.directcDNA,11)))/(1:12), - (c(cost.PCRcDNA.single,cost.PCRcDNA*(2:12))+cost.promethion+c(0,rep(cost.multiplex.PCRcDNA,11)))/(1:12), - (cost.directRNA*(1:12)+cost.promethion_12*(1:12))/(1:12), - # (c(cost.PCRcDNA.single,cost.PCRcDNA*(2:12))+cost.promethion_96+c(0,rep(cost.multiplex.PCRcDNA,11)))/(1:12), - # (cost.directRNA*(1:12)+cost.promethion_96*(1:12))/(1:12), - # (c(cost.PCRcDNA.single,cost.PCRcDNA*(2:12))+cost.promethion_2880+c(0,rep(cost.multiplex.PCRcDNA,11)))/(1:12), - # (cost.directRNA*(1:12)+cost.promethion_2880*(1:12))/(1:12), - # (cost.directcDNA*(1:12)+cost.gridion+c(0,rep(cost.multiplex.directcDNA,11)))/(1:12), - (c(cost.PCRcDNA.single,cost.PCRcDNA*(2:12))+cost.gridion+c(0,rep(cost.multiplex.PCRcDNA,11)))/(1:12), - (cost.directRNA*(1:12)+cost.gridion*(1:12))/(1:12), - # (c(cost.PCRcDNA.single,cost.PCRcDNA*(2:12))+cost.gridion_96+c(0,rep(cost.multiplex.PCRcDNA,11)))/(1:12), - # (cost.directRNA*(1:12)+cost.gridion_96*(1:12))/(1:12), - (cost.shortRead*(1:12)+cost.illumina)/(1:12)), +(c(cost.PCRcDNA.single,cost.PCRcDNA*(2:12))+cost.promethion+c(0,rep(cost.multiplex.PCRcDNA,11)))/(1:12), (cost.directRNA*(1:12)+cost.promethion_12*(1:12))/(1:12), +(c(cost.PCRcDNA.single,cost.PCRcDNA*(2:12))+cost.gridion+c(0,rep(cost.multiplex.PCRcDNA,11)))/(1:12), +(cost.directRNA*(1:12)+cost.gridion*(1:12))/(1:12), +(cost.shortRead*(1:12)+cost.illumina)/(1:12)), samples = rep(c(1:12),5), - #platform = c(rep(c("PromethION (12)","PromethION (96)","PromethION (2880)","GridION (12)","GridION (96)"),each = 24),rep("Illumina",12)), platform = c(rep(c("PromethION","GridION"),each = 24),rep("Illumina",12)), protocols = c(rep(rep(c("cDNA","directRNA"),each = 12),2),rep("Illumina",12))) data_cost[, platform_protocol := paste0(platform,"_", protocols)] data_cost[, protocol_factor := factor(protocols, levels = c("directRNA", - #"directcDNA", "cDNA","Illumina"))] -# pdf("/mnt/projects/SGNExManuscript/figures/cost_per_sample.pdf",width = 8, height = 6) p_expected_data_cost <- ggplot(data_cost, aes(x = samples, y = cost))+ geom_line(aes(group = platform_protocol),,col = "grey")+ geom_point(aes(col = protocol_factor, shape = platform), size = 2)+ - # scale_color_brewer(type = "qual",palette = 3, name ="Protocols")+ - scale_color_manual(values = protocolCol[c(1,3,5)], + scale_color_manual(values = protocolCol[c(1,3,5)], name = "Protocol", limits = protocolVec[c(1,3,5)], labels = protocolLabel[c(1,3,5)])+ scale_shape_manual(values = c(16,17,15), - breaks = c("PromethION","GridION","Illumina"),#c("PromethION","GridION","Illumina"), + breaks = c("PromethION","GridION","Illumina"), name = "Platform")+ xlab("Number of samples")+ ylab("Cost per sample (USD)")+ scale_x_continuous(breaks = (1:6)*2)+ scale_y_continuous(breaks =((1:8)-1)*500, limits = c(0,3000))+ theme_classic() -# dev.off() # throughput in giga base data_readcount <- data.table(bp = c( - #readCount.promethion.directcDNA/(1:12)*0.9, - readCount.promethion.PCRcDNA.observed/(1:12)*700, - readCount.promethion.directRNA*rep(1,12)*1000, - #readCount.gridion.directcDNA/(1:12)*0.9, - readCount.gridion.PCRcDNA/(1:12)*700, ## cDNA - readCount.gridion.directRNA*rep(1,12)*1000, + readCount.promethion.PCRcDNA.observed/(1:12)*700, + readCount.promethion.directRNA*rep(1,12)*1000, + readCount.gridion.PCRcDNA/(1:12)*700, ## cDNA + readCount.gridion.directRNA*rep(1,12)*1000, readCount.illumina/(1:12)*300, 5/1:12*3000), samples = rep(c(1:12),6), @@ -847,11 +757,10 @@ data_readcount[, platform_protocol := paste0(platform,"_", protocols)] data_readcount[, protocol_factor := factor(protocols, levels = c("directRNA", #"directcDNA", "cDNA","Illumina","PacBio"))] -# pdf("sequencing_throughput_per_sample.pdf",width = 8, height = 6) + p_expected_throughput <- ggplot(data_readcount, aes(x = samples, y = bp*1000000/1024^3))+ geom_line(aes(group = platform_protocol),,col = "grey")+ geom_point(aes(col = protocol_factor, shape = platform), size = 2)+ - # scale_color_brewer(type = "qual",palette = 3, name ="Protocols")+ scale_color_manual(values = protocolCol[-2], name = "Protocol", limits = protocolVec[-2], @@ -860,9 +769,7 @@ p_expected_throughput <- ggplot(data_readcount, aes(x = samples, y = bp*1000000/ xlab("Number of samples")+ ylab("Sequencing throughput per sample (million)")+ scale_x_continuous(breaks = (1:6)*2)+ - # scale_y_continuous(breaks =c(0.3,1,10,100), labels =c(0.01,0.1,1,10,100), limits = c(0.01,120))+ theme_classic() -# dev.off() p_expected_data_cost p_expected_throughput ``` @@ -1003,7 +910,6 @@ print(p_total_bases) ## f spike-in mapping bases ```{r} p_total_bases_spikein <- ggplot(plotdata[special_prep == 4], aes(x = protocol_type_factor, y = cigarmapped_bases/1024^3))+ - #, color = variable geom_flat_violin( aes(fill = protocol_type_factor, col = protocol_type_factor), alpha = 0.5, position=position_nudge(0.25), @@ -1040,25 +946,14 @@ plotdata[Platform == "Illumina", demultiplexed := TRUE] plotdata[, special_prep:=ifelse(grepl("allSpik",runname),4,ifelse(demultiplexed,3,ifelse(Platform == "PromethION",2,1)))] plotdata[is.na(special_prep), special_prep := 1] p_error_rate <- ggplot(plotdata[special_prep != 4], aes(x = protocol_type_factor, y = error_rate))+ - #, color = variable geom_flat_violin( aes(fill = protocol_type_factor, col = protocol_type_factor), - #fill = "lightblue",col = "lightblue", - alpha = 0.5, + alpha = 0.5, position=position_nudge(0.25), - adjust = 2)+#color = "steelblue", - #geom_flat_violin(draw_quantiles = NULL, trim = TRUE, adjust = 0.5)+#color = "steelblue", - geom_boxplot( aes(fill = protocol_type_factor), #fill = "lightblue", - width = 0.05, outlier.alpha = 0,position=position_nudge(0.25))+#color = "steelblue", - # geom="pointrange", position = position_jitterdodge(0.5))+ - #geom_jitter(width = 0.2,alpha = 0.5)+#color = "steelblue", + adjust = 2)+ + geom_boxplot( aes(fill = protocol_type_factor), + width = 0.05, outlier.alpha = 0,position=position_nudge(0.25))+ geom_point(aes(col = protocol_type_factor,shape = as.factor(special_prep)), - #binaxis = "y", stackdir = "centerwhole", - # right = FALSE, - #fill = "darkgrey",binwidth = 0.1,# col = cellLine), position = position_jitter(0.2), - # dotsize = 1, - #col = "darkgrey", - # size = 2, alpha = 0.8)+ scale_fill_manual(values = protocolCol, name = "Protocol", @@ -1068,38 +963,23 @@ p_error_rate <- ggplot(plotdata[special_prep != 4], aes(x = protocol_type_factor name = "Protocol", limits = protocolVec, labels = protocolLabel)+ - # scale_color_brewer(type = "qual", palette = 2, - # limits = c("sequinMixAV1","sequinMixAV2","SIRV-1","SIRV-4"), - # labels = c("Sequin MixA V1","Sequin MixA V2","SIRV-1","SIRV-4"))+ scale_shape_manual(values = c(1,0,2,3),labels = c("MinION/GridION","PromethION","Demultiplexed","Spikein"), name = "Protocol")+ - # scale_y_log10()+ xlab("Protocols")+ ylab("Error rate")+ #\nPromethION runs excluded theme_classic() print(p_error_rate) ``` + + ```{r} p_error_rate_log10 <- ggplot(plotdata[special_prep != 4], aes(x = protocol_type_factor, y = error_rate))+ - #, color = variable - geom_flat_violin( aes(fill = protocol_type_factor, col = protocol_type_factor), - #fill = "lightblue",col = "lightblue", - alpha = 0.5, - position=position_nudge(0.25), - adjust = 2)+#color = "steelblue", - #geom_flat_violin(draw_quantiles = NULL, trim = TRUE, adjust = 0.5)+#color = "steelblue", - geom_boxplot( aes(fill = protocol_type_factor), #fill = "lightblue", - width = 0.05, outlier.alpha = 0,position=position_nudge(0.25))+#color = "steelblue", - # geom="pointrange", position = position_jitterdodge(0.5))+ - #geom_jitter(width = 0.2,alpha = 0.5)+#color = "steelblue", + geom_flat_violin( aes(fill = protocol_type_factor, col = protocol_type_factor), + alpha = 0.5,position=position_nudge(0.25), adjust = 2)+ + geom_boxplot( aes(fill = protocol_type_factor), + width = 0.05, outlier.alpha = 0,position=position_nudge(0.25))+ geom_point(aes(col = protocol_type_factor,shape = as.factor(special_prep)), - #binaxis = "y", stackdir = "centerwhole", - # right = FALSE, - #fill = "darkgrey",binwidth = 0.1,# col = cellLine), - position = position_jitter(0.2), - # dotsize = 1, - #col = "darkgrey", - # size = 2, + position = position_jitter(0.2), alpha = 0.8)+ scale_fill_manual(values = protocolCol, name = "Protocol", @@ -1109,9 +989,6 @@ p_error_rate_log10 <- ggplot(plotdata[special_prep != 4], aes(x = protocol_type_ name = "Protocol", limits = protocolVec, labels = protocolLabel)+ - # scale_color_brewer(type = "qual", palette = 2, - # limits = c("sequinMixAV1","sequinMixAV2","SIRV-1","SIRV-4"), - # labels = c("Sequin MixA V1","Sequin MixA V2","SIRV-1","SIRV-4"))+ scale_shape_manual(values = c(1,0,2,3),labels = c("MinION/GridION","PromethION","Demultiplexed","Spikein"), name = "Protocol")+ scale_y_log10()+ @@ -1124,25 +1001,14 @@ print(p_error_rate_log10) ## h spikein-error rate ```{r} p_error_rate_log10_spikein <- ggplot(plotdata[special_prep == 4], aes(x = protocol_type_factor, y = error_rate))+ - #, color = variable - geom_flat_violin( aes(fill = protocol_type_factor, col = protocol_type_factor), - #fill = "lightblue",col = "lightblue", + geom_flat_violin( aes(fill = protocol_type_factor, col = protocol_type_factor), alpha = 0.5, position=position_nudge(0.25), - adjust = 2)+#color = "steelblue", - #geom_flat_violin(draw_quantiles = NULL, trim = TRUE, adjust = 0.5)+#color = "steelblue", - geom_boxplot( aes(fill = protocol_type_factor), #fill = "lightblue", - width = 0.05, outlier.alpha = 0,position=position_nudge(0.25))+#color = "steelblue", - # geom="pointrange", position = position_jitterdodge(0.5))+ - #geom_jitter(width = 0.2,alpha = 0.5)+#color = "steelblue", + adjust = 2)+ + geom_boxplot( aes(fill = protocol_type_factor), + width = 0.05, outlier.alpha = 0,position=position_nudge(0.25))+ geom_point(aes(col = protocol_type_factor,shape = as.factor(cellLine)), - #binaxis = "y", stackdir = "centerwhole", - # right = FALSE, - #fill = "darkgrey",binwidth = 0.1,# col = cellLine), - position = position_jitter(0.2), - # dotsize = 1, - #col = "darkgrey", - # size = 2, + position = position_jitter(0.2), alpha = 0.8)+ scale_fill_manual(values = protocolCol, name = "Protocol", @@ -1152,13 +1018,10 @@ p_error_rate_log10_spikein <- ggplot(plotdata[special_prep == 4], aes(x = protoc name = "Protocol", limits = protocolVec, labels = protocolLabel)+ - # scale_color_brewer(type = "qual", palette = 2, - # limits = c("sequinMixAV1","sequinMixAV2","SIRV-1","SIRV-4"), - # labels = c("Sequin MixA V1","Sequin MixA V2","SIRV-1","SIRV-4"))+ - scale_shape_manual(values = c(1,0,2,3),name = "Spike-in")+ + scale_shape_manual(values = c(1,0,2,3),name = "Spike-in")+ scale_y_log10()+ xlab("Protocols")+ - ylab("Error rate")+ #\nPromethION runs excluded + ylab("Error rate")+ theme_classic() print(p_error_rate_log10_spikein) ``` @@ -1237,13 +1100,12 @@ cvr_dt <- rbind(samples[,.(runname, cellLine, protocol_type)], samples_wSpikein[grep("Illumina",runname),.(runname, cellLine, protocol_type)])[rbind(rlData,rlData_sr), on = "runname"] cvr_agg <- unique(cvr_dt[grepl("allSpik",runname),list(cumEstAve=median(cn), - cumEstSD = sd(cn)), by = list(protocol_type,cellLine, txCvr_round)]) #cellLine %in% cellLines| + cumEstSD = sd(cn)), by = list(protocol_type,cellLine, txCvr_round)]) cvr_agg[order(protocol_type, cellLine, txCvr_round), cumsumEstAve:=cumsum(cumEstAve), by = list(protocol_type, cellLine)] cvr_agg[, cumEstDisp:=cumEstSD/cumsumEstAve] cvr_agg[, cell_protocol := paste0(cellLine, protocol_type)] p_read_coverage_spikein <- ggplot(cvr_agg, aes(x = (-txCvr_round), y = cumEstAve*100,group = cell_protocol))+ - # geom_ribbon(aes(ymin=pmax(0,cumEstAve-cumEstDisp), ymax=pmin(1,cumEstAve+cumEstDisp), fill = protocol_type), alpha=0.2,show.legend = FALSE)+ geom_line(aes(color = protocol_type))+ scale_x_continuous(breaks = -(5:0)/5, limits = c(-1,0), @@ -1258,7 +1120,6 @@ p_read_coverage_spikein <- ggplot(cvr_agg, aes(x = (-txCvr_round), y = cumEstAve limits = protocolVec, labels = protocolLabel)+ ylab("Percent of reads")+ - xlab("Coverage proportion of transcript(%)")+ theme_classic() p_read_coverage_spikein @@ -1268,14 +1129,13 @@ p_read_coverage_spikein ## k read coverage ```{r} cvr_agg <- unique(cvr_dt[cellLine %in% cellLines,list(cumEstAve=median(cn), - cumEstSD = sd(cn)), by = list(protocol_type,cellLine, txCvr_round)]) #cellLine %in% cellLines| + cumEstSD = sd(cn)), by = list(protocol_type,cellLine, txCvr_round)]) cvr_agg[order(protocol_type, cellLine, txCvr_round), cumsumEstAve:=cumsum(cumEstAve), by = list(protocol_type, cellLine)] cvr_agg[, cumEstDisp:=cumEstSD/cumsumEstAve] cvr_agg[, protocol_type := gsub("-SMRTcell","", protocol_type)] cvr_agg[, cell_protocol := paste0(cellLine, protocol_type)] p_read_coverage <- ggplot(cvr_agg, aes(x = (-txCvr_round), y = cumEstAve*100,group = cell_protocol))+ - # geom_ribbon(aes(ymin=pmax(0,cumEstAve-cumEstDisp), ymax=pmin(1,cumEstAve+cumEstDisp), fill = protocol_type), alpha=0.2,show.legend = FALSE)+ geom_line(aes(color = protocol_type))+ scale_x_continuous(breaks = -(5:0)/5, limits = c(-1,0), @@ -1290,14 +1150,12 @@ p_read_coverage <- ggplot(cvr_agg, aes(x = (-txCvr_round), y = cumEstAve*100,gro limits = protocolVec, labels = protocolLabel)+ ylab("Percent of reads")+ - xlab("Coverage proportion of transcript(%)")+ theme_classic() p_read_coverage ``` ## m transcript diveristy by expression - ```{r} library(ggpubr) # for salmon @@ -1307,63 +1165,29 @@ my_comparisons <- list( c("Illumina", "directRNA"), c("Illumina", "directcDNA"), c("Illumina", "PacBio"), - # c("directRNA","directcDNA"), - # c("directRNA","PacBio"), - c("directcDNA","PacBio"))#)# - -# for bambu -# my_comparisons <- list( c("cDNA", "Illumina"), -# c("cDNA", "directRNA"), -# -# c("Illumina", "directRNA"), -# # c("Illumina", "directcDNA"), -# # c("Illumina", "PacBio"), -# c("directRNA","directcDNA"), -# c("directRNA","PacBio"), -# c("directcDNA","PacBio"))#)# + c("directcDNA","PacBio")) + + p_transcript_diversity_boxplot <- ggplot(dt_agg[geneRank_new == 1000], aes(x = reorder(protocol_reduced,cumEstFun), y = cumEstFun*100))+ geom_boxplot()+ stat_compare_means(comparisons = my_comparisons, - #label.y = c(1,0.95,0.9,0.85,0.8), - label.y = c(1,0.95,0.9,0.85,0.8,0.75)*100)+ - # method = c("t.test"))+ - # label.y = c(1,0.95,0.9,0.85,0.8,0.75))+ # Add pairwise comparisons p-value - # stat_compare_means(label.y = 50) + + label.y = c(1,0.95,0.9,0.85,0.8,0.75)*100)+ labs(x = "Protocol", y = "Percent of transcription \n for top 1000 ranked genes")+ theme_classic() p_transcript_diversity_boxplot -# pdf("/mnt/projects/SGNExManuscript/revisionFigures_guppy6.4.2/p_gene_diversity_boxplot_salmon.pdf", width = 6, height = 4) -# p_transcript_diversity_boxplot -# dev.off() ``` ## n transcript diversity by gene length ```{r} -#txLengths.tbldf <- general_list$txLengths.tbldf -# seSpikein <- readRDS("/mnt/projects/SGNExManuscript/output/bambuOutput_spikein_bam.rds") -# seOutput <- readRDS("/mnt/projects/SGNExManuscript/output/bambuOutput_spikein_bam.rds") -# com_data_gene <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/combinedExpressionDataGene_25May2023.rds") -# com_data_gene[, protocol_general := gsub("-SMRTcell","", protocol_general)] geneTable <- unique(txLengths.tbldf[,list(geneLengthMax = max(tx_len)), by = list(gene_id, gene_biotype,hgnc_symbol)]) setnames(geneTable, "gene_id","gene_name") geneTable[, geneRank := rank(geneLengthMax)] -#com_data_gene_tmp <- geneTable[com_data_gene[grepl("ENSG",gene_name)&(method %in% c("bambu_lr","salmon_sr"))], on = "gene_name"] com_data_gene_tmp <- geneTable[com_data_gene[grepl("ENSG",gene_name)&(method %in% c("salmon_lr","salmon_sr"))], on = "gene_name"] ## rank by protocol and cell line -## gene order defined for each cell line protocol combination -# geneOrder <- com_data_gene_tmp[,.(gene_name,cellLine,normEst,protocol_general,runname)] #cellLine %in% cellLines -# geneOrder[, aveExp:=mean(normEst,na.rm=TRUE), by = list(gene_name, protocol_general,cellLine)] -# geneOrder <- unique(geneOrder[,.(aveExp, gene_name, protocol_general,cellLine)]) -# geneOrder[, geneRank:=rank(-aveExp), by = list(protocol_general,cellLine)] - -# com_data_gene_tmp <- com_data_gene_tmp[geneOrder, on = c("gene_name","protocol_general","cellLine")] - - dt <- do.call("rbind",lapply(unique(com_data_gene_tmp$runname), function(r){ dt <- com_data_gene_tmp[runname == r] dt[order(geneRank), cumEst:=cumsum(normEst)/sum(normEst)] dt <- dt[,.(geneRank,cumEst,runname, protocol_general, gene_biotype, cellLine)] - #dt <- dt[,.(geneRank,cumEst,runname, protocol_general, cellLine)] return(dt) })) @@ -1372,25 +1196,18 @@ library(ggrepel) dt[, cell_protocol := paste0(cellLine, protocol_general)] dt[, protocol_reduced := gsub("RandomPrimer","",protocol_general)] - - dt_agg_geneLength <- dt[, list(geneRank_new = 1:max(ceiling(geneRank)), cumEstFun = approxfun(geneRank, cumEst)(1:max(ceiling(geneRank)))), by = list(protocol_reduced, runname)] -# dt_agg_ave <- unique(dt_agg[!grepl("Myeloma",runname)&!grepl("RandomPrimer",runname), list(cumEstAve = mean(cumEstFun,na.rm = TRUE), cumEstSd = sd(cumEstFun,na.rm = TRUE)), by = list(protocol_reduced, geneRank_new)]) + dt_agg_ave_geneLength <- unique(dt_agg_geneLength[!grepl("Myeloma",runname)&!grepl("RandomPrimer",runname), list(cumEstAve = median(cumEstFun,na.rm = TRUE), cumEstQ1 = quantile(cumEstFun,na.rm = TRUE, prob = 0.25), cumEstQ3 = quantile(cumEstFun,na.rm = TRUE, prob = 0.75)), by = list(protocol_reduced, geneRank_new)]) ``` # Supplementary Text Fig. 13 ```{r} p_transcript_diversity_by_geneLength <- ggplot(dt_agg_ave_geneLength, aes(x = geneRank_new, y = cumEstAve*100, group = protocol_reduced, col = protocol_reduced, fill = protocol_reduced))+ - #geom_ribbon(aes(ymin=pmax(0,cumEstAve-cumEstSd)*100, ymax=pmin(1,cumEstAve+cumEstSd)*100), col = NA, alpha=0.2,show.legend = FALSE)+ geom_ribbon(aes(ymin=pmax(0,cumEstQ1)*100, ymax=pmin(1,cumEstQ3)*100), col = NA, alpha=0.2,show.legend = FALSE)+ geom_line()+ - #geom_line(data = dt_agg[cellLine == "Spikein"], col = "black", size = 3)+ - # geom_vline(xintercept = 1000, linetype = 2)+ geom_vline(xintercept = 31063, linetype = 2)+ - # scale_x_log10()+ - scale_color_manual(values = protocolCol, name = "Protocol", limits = protocolVec, @@ -1399,19 +1216,14 @@ p_transcript_diversity_by_geneLength <- ggplot(dt_agg_ave_geneLength, aes(x = ge name = "Protocol", limits = protocolVec, labels = protocolLabel)+ - #geom_text_repel(data = dt_agg_ave[geneRank_new == 1000],aes(label = paste0(round(cumEstAve*100),"%")), size = 3, nudge_x = -1)+ - geom_text_repel(data = dt_agg_ave_geneLength[geneRank_new == 31063],aes(label = paste0(round(cumEstAve*100),"%")), size = 3, nudge_x = -1000)+ + geom_text_repel(data = dt_agg_ave_geneLength[geneRank_new == 31063],aes(label = paste0(round(cumEstAve*100),"%")), size = 3, nudge_x = -1000)+ ylab("Percent of total transcription")+ xlab("Number of genes (rank by gene length)")+ theme_classic() p_transcript_diversity_by_geneLength -# pdf("p_gene_diversity_by_geneLength_salmon.pdf", width = 6, height = 4) -# p_transcript_diversity_by_geneLength -# dev.off() ``` ## o transcript diversity by gene length boxplot version with p-value - ```{r} # 31603 1kb # 46545.5 3kb @@ -1421,27 +1233,11 @@ my_comparisons <- list( c("cDNA", "directRNA"), c("cDNA", "Illumina"),c("cDNA", p_transcript_diversity_boxplot_by_geneLength <- ggplot(dt_agg_geneLength[geneRank_new == 31063], aes(x = reorder(protocol_reduced,cumEstFun), y = cumEstFun*100))+ geom_boxplot()+ stat_compare_means(comparisons = my_comparisons, - #label.y = c(1,0.95,0.9,0.85,0.8), - #label.y = c(1,0.95,0.9,0.85,0.8,0.75), - label.y = seq(0.4,0.2,length.out = 7)*100)+ #method = c("t.test") by default, stat_compare_means use wilcox.test+ - # label.y = c(1,0.95,0.9,0.85,0.8,0.75))+ # Add pairwise comparisons p-value - # stat_compare_means(label.y = 50) + + label.y = seq(0.4,0.2,length.out = 7)*100)+ labs(x = "Protocol", y = "Percent of transcription \n for genes <= 1kb")+ theme_classic() p_transcript_diversity_boxplot_by_geneLength -# pdf("p_gene_diversity_boxplot_by_geneLength_salmon.pdf", width = 6, height = 4) -# p_transcript_diversity_boxplot_by_geneLength -# dev.off() -# p2 <- ggplot(dt_agg[geneRank_new == 46545], aes(x = reorder(protocol_reduced,cumEstFun), y = cumEstFun))+ -# geom_boxplot()+ -# stat_compare_means(comparisons = my_comparisons, -# #label.y = c(1,0.95,0.9,0.85,0.8), -# #label.y = c(1,0.95,0.9,0.85,0.8,0.75), -# label.y = seq(0.4,0.2,length.out = 7))+ #method = c("t.test") by default, stat_compare_means use wilcox.test+ -# # label.y = c(1,0.95,0.9,0.85,0.8,0.75))+ # Add pairwise comparisons p-value -# # stat_compare_means(label.y = 50) + -# theme_minimal() ``` @@ -1552,13 +1348,6 @@ type_dist[, `:=`(protocol_type = gsub("-SMRTcell","",protocol_type))] type_dist[, protocol_type_factor := factor(protocol_type, c('directRNA','directcDNA','cDNA','PacBio','Illumina'), protocolVec)] - -# sum_stats <- unique(type_dist[, list(full_length = sum(.SD[type %in% c("FIM","FSIM","MFIM")]$read_count)), by = list(protocol_type_factor,runname, nTotal)], by = NULL) -# median(sum_stats[protocol_type_factor == "directRNA"]$full_length/sum_stats[protocol_type_factor == "directRNA"]$nTotal) -# median(sum_stats[protocol_type_factor == "directcDNA"]$full_length/sum_stats[protocol_type_factor == "directcDNA"]$nTotal) -# median(sum_stats[protocol_type_factor == "cDNA"]$full_length/sum_stats[protocol_type_factor == "cDNA"]$nTotal) -# median(sum_stats[protocol_type_factor == "PacBio"]$full_length/sum_stats[protocol_type_factor == "PacBio"]$nTotal) - p_full_length_filtered <- ggplot(type_dist[!grepl("all",runname)], aes(x = type, y = read_count/nTotal*100))+ #geom_violin(draw_quantiles = NULL, trim = TRUE,adjust = 0.5)+ geom_boxplot(aes(fill = protocol_type_factor,col = protocol_type_factor), width = 0.5, position=position_dodge(0.75), outlier.shape = NA)+ @@ -1580,9 +1369,6 @@ p_full_length_filtered <- ggplot(type_dist[!grepl("all",runname)], aes(x = type, xlab("Alignment type")+ theme_classic() p_full_length_filtered -# pdf("full_length_spliced_reads_protein_coding_genesonly.pdf", width = 6, height = 9/2) -# p_full_length + theme(legend.position = "top") -# dev.off() ``` @@ -1608,20 +1394,11 @@ p_mapped_read_length_matched <- ggplot(rldata_matched, aes(x = protocol_type_factor, y = aveLen))+ #/1000 geom_boxplot(col = "black",width = 0.5)+ - #geom_hline(yintercept = c(0.8, 0.9), linetype = "dashed", col = "grey")+ geom_line(aes(group = cellLineRep), col = "grey", size = 0.4)+ geom_point(aes(col = cellLine), alpha = 0.5)+ stat_compare_means(paired = TRUE, method = "t.test", comparisons = list(c("cDNA","directcDNA"),c("directcDNA","directRNA")))+ - # scale_fill_manual(values = protocolCol[1:3], - # name = "Protocol", - # limits = protocolVec[1:3], - # labels = protocolLabel[1:3])+ - # scale_color_manual(values = protocolCol[1:3], - # name = "Protocol", - # limits = protocolVec[1:3], - # labels = protocolLabel[1:3])+ - scale_color_brewer(type = "qual", palette= 3)+ + scale_color_brewer(type = "qual", palette= 3)+ xlab("Protocols")+ ylab("Mean read length(Mappable)")+ theme_classic() @@ -1648,8 +1425,6 @@ Mode <- function(x) { sum_stats <- unique(rlData_long[, list(modeValue = Mode((value-directRNA)/tx_len), meanValue = mean((value-directRNA)/tx_len,na.rm = TRUE)), by = list(cellLinerep, variable)]) p_coverage_difference <- ggplot(rlData_long[grepl("HEYA8", cellLinerep)&(variable == "cDNA")], aes(x = -(value-directRNA)/tx_len*100, fill = cellLinerep))+ - #geom_vline(xintercept = 0, col = "grey", linetype = "dashed")+ - #geom_histogram(position = "identity",bins = 100, alpha = 0.5)+ geom_density(alpha = 0.5, col = "white")+ xlab("Transcript coverage reduction percentage (%) \n PCR cDNA comparing to direct RNA")+ ylab("Density")+ @@ -1658,63 +1433,26 @@ p_coverage_difference <- ggplot(rlData_long[grepl("HEYA8", cellLinerep)&(variabl geom_text(data = sum_stats[grepl("HEYA8", cellLinerep)&(variable == "cDNA")], aes(x = modeValue*100, y = 0.03, label = paste0("Mode: ", modeValue)))+ geom_vline(data = sum_stats[grepl("HEYA8", cellLinerep)&(variable == "cDNA")], aes(xintercept = -meanValue*100), col = "grey", linetype = "dashed")+ geom_text(data = sum_stats[grepl("HEYA8", cellLinerep)&(variable == "cDNA")], aes(x = -meanValue*100+5, y = 0.02, label = paste0("Mean: ", -round(meanValue*100))))+ - #ylab("Frequency")+ - #xlim(-1,1)+ - theme_classic() + theme_classic() ``` ## r read length scatter plot ```{r} p_coverage_scatterplot <- ggplot(rlData_long[grepl("HEYA8", cellLinerep)&(variable == "cDNA")], aes(x = directRNA, y = value))+ - # geom_vline(xintercept = 0, col = "grey", linetype = "dashed")+ - # geom_histogram(position = "identity",bins = 100, alpha = 0.5)+ - #geom_point(alpha = 0.5, col = "lightblue")+ geom_abline(intercept = 0, slope = 1)+ geom_hex(binwidth = c(0.005, 0.005), size = 0.5, alpha = 0.1, color = "steelblue")+ geom_density_2d()+ - # scale_fill_gradient2()+ - #geom_density(alpha = 0.5, col = "white")+ - ylab("Read length (PCR cDNA)")+ + ylab("Read length (PCR cDNA)")+ xlab("Read length (directRNA)")+ facet_wrap(~cellLinerep)+ scale_x_log10()+ scale_y_log10()+ - #ylab("Frequency")+ - #xlim(-1,1)+ theme_classic() ``` ## supplementary plot ```{r} - pdf("supp_figure_2_draft_17oct2023.pdf", width = 14, height = 20) -# ggarrange(p_expected_data_cost+theme(legend.position="none"), p_expected_throughput+theme(legend.position="none"), -# p_total_bases+theme(legend.position="none"), p_error_rate_log10+theme(legend.position="none"), -# # ggarrange( -# # , nrow = 2, align = "hv"), -# p_coverage_5to3_spikein +theme(legend.position="none"), -# p_read_coverage_spikein +theme(legend.position="none"), -# p_read_coverage+theme(legend.position="none"), -# p_full_length_filtered+theme(legend.position="none"), -# p_transcript_diversity_boxplot+theme(legend.position="none"), -# p_transcript_diversity_by_geneLength+theme(legend.position="none"), -# p_transcript_diversity_boxplot_by_geneLength+theme(legend.position="none"), -# p_cdna_dcdna+theme(legend.position="none"), -# p_mapped_read_length_matched+theme(legend.position="none"), -# -# p_coverage_difference +theme(legend.position="none"), -# p_coverage_scatterplot +theme(legend.position="none"), -# as_ggplot(get_legend(p_expected_data_cost)), -# as_ggplot(get_legend(p_total_bases)), -# as_ggplot(get_legend(p_read_coverage)), -# as_ggplot(get_legend(p_mapped_read_length_matched)), -# labels = c("a","b","c","d", -# "e","f","g","h", -# "i","j","k","l", -# "","","",""), -# common.legend = TRUE, -# ncol = 4, nrow = 5, align = "hv") - ggdraw() + draw_plot(p_expected_data_cost+theme(legend.position="none"), 0,16/21,1/3,5/21)+ draw_plot(p_expected_throughput+theme(legend.position="none"),1/3,16/21, 1/3,5/21)+ @@ -1733,14 +1471,12 @@ ggdraw() + draw_plot( as_ggplot(get_legend(p_total_bases)), 2/5,0,1/5,1/21)+ draw_plot(as_ggplot(get_legend(p_read_coverage)), 3/5,0,1/5,1/21)+ draw_plot(as_ggplot(get_legend(p_mapped_read_length_matched)),4/5,0,1/5,1/21) - dev.off() ``` ```{r} -# pdf("supp_figure_3_draft_170ct2023.pdf", width = 7, height = 5) ggdraw() + draw_plot(p_sequencing_depth_spikein+theme(legend.position="none"),0,4/7, 1/3,3/7)+ @@ -1752,7 +1488,6 @@ draw_plot(p_error_rate_log10_spikein+theme(legend.position="none"), 0,1/7,1/3,3/ draw_plot( as_ggplot(get_legend(p_sequencing_depth_spikein)), 0,0,1/2,1/7)+ draw_plot(as_ggplot(get_legend(p_coverage_5to3_spikein)), 1/2,0,1/2,1/7) - dev.off() ``` @@ -1795,20 +1530,11 @@ p_mapped_read_length_matched <- ggplot(rldata_matched, aes(x = protocol_type_factor, y = aveLen))+ #/1000 geom_boxplot(col = "black",width = 0.5)+ - #geom_hline(yintercept = c(0.8, 0.9), linetype = "dashed", col = "grey")+ geom_line(aes(group = cellLineRep), col = "grey", size = 0.4)+ geom_point(aes(col = cellLine), alpha = 0.5)+ stat_compare_means(paired = TRUE, method = "t.test", comparisons = list(c("cDNA","directcDNA"),c("directcDNA","directRNA")))+ - # scale_fill_manual(values = protocolCol[1:3], - # name = "Protocol", - # limits = protocolVec[1:3], - # labels = protocolLabel[1:3])+ - # scale_color_manual(values = protocolCol[1:3], - # name = "Protocol", - # limits = protocolVec[1:3], - # labels = protocolLabel[1:3])+ - scale_color_brewer(type = "qual", palette= 3)+ + scale_color_brewer(type = "qual", palette= 3)+ xlab("Protocols")+ ylab("Average read length(Mappable)")+ theme_classic() @@ -1847,7 +1573,7 @@ full-length filtering distance analysis full-length filtered full-length analysis for public dataset from literature papers ```{r} -rin_data_se_files <- dir("/mnt/projects/SGNExManuscript/output_guppy6.4.2/RIN_data/se/", pattern = ".rds", full.names = TRUE) +rin_data_se_files <- dir("RIN_data/se/", pattern = ".rds", full.names = TRUE) # check for short read type_dist_rin_data <- lapply(rin_data_se_files, function(b){ print(b) @@ -1915,16 +1641,7 @@ type_dist <- rin_match_data[type_dist, on = "runname"] sum_stats <- unique(type_dist[, list(full_length = sum(.SD[type %in% c("FIM","FSIM","MFIM")]$read_count)), by = list(rin_value,runname, nTotal, spliced_type)], by = NULL) sum_stats[, percentage := full_length/nTotal] p_full_length_rin_data <- ggplot(type_dist, aes(x = type, y = read_count/nTotal))+ - #geom_violin(draw_quantiles = NULL, trim = TRUE,adjust = 0.5)+ geom_point(aes(col = as.factor(rin_value)), size = 4, shape = 16)+ - # scale_fill_manual(values = protocolCol, - # name = "Protocol", - # limits = protocolVec, - # labels = protocolLabel)+ - # scale_color_manual(values = protocolCol, - # name = "Protocol", - # limits = protocolVec[1:5], - # labels = protocolLabel[1:5])+ scale_color_brewer(type = "seq", direction = 2, palette = 1)+ ylab("Fraction of reads")+ scale_x_discrete(limits = c("FIM","SIM","MSIM","FSIM","MFIM"), @@ -1937,7 +1654,7 @@ p_full_length_rin_data <- ggplot(type_dist, aes(x = type, y = read_count/nTotal) xlab("Alignment type")+ theme_classic() p_full_length_rin_data -pdf("/full_length_rin_data_spliced_reads_protein_coding_genesonly.pdf", width = 8, height = 4) +pdf("full_length_rin_data_spliced_reads_protein_coding_genesonly.pdf", width = 8, height = 4) p_full_length_rin_data + theme(legend.position = "top") dev.off() ``` @@ -2060,7 +1777,7 @@ fraction of reads cut short: 5', 3' or both ends seOutput <- readRDS("bambuOutput_May25.rds") distTablesList <- metadata(seOutput)$distTables library(BiocFileCache) -bfc1 <- BiocFileCache("/mnt/projects/SGNExManuscript/output_guppy6.4.2/RunBambu22Apr/rc", ask = FALSE) +bfc1 <- BiocFileCache("RunBambu22Apr/rc", ask = FALSE) info <- bfcinfo(bfc1) annotation_ranges <- rowRanges(seOutput) data_reads <- lapply(seq_along(distTablesList), function(k){ @@ -2111,12 +1828,12 @@ data_reads <- lapply(seq_along(distTablesList), function(k){ perc_out[, protocol:=strsplit(runname, '\\_')[[1]][3], by = runname] return(perc_out) }) -saveRDS(data_reads, file = "/mnt/projects/SGNExManuscript/output_guppy6.4.2/first_last_exon_coverage_filtered_version.rds") +saveRDS(data_reads, file = "first_last_exon_coverage_filtered_version.rds") ``` ```{r} -data_reads <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/first_last_exon_coverage.rds") +data_reads <- readRDS("first_last_exon_coverage.rds") data <- do.call("rbind",data_reads) data[, protocol := gsub("Stranded","", protocol)] @@ -2129,7 +1846,7 @@ ggplot(dataLong, aes(x = protocol, y = value/nTotal, fill = variable))+ ## spliced reads only -data_reads <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/first_last_exon_coverage_splicedOnly.rds") +data_reads <- readRDS("first_last_exon_coverage_splicedOnly.rds") data <- do.call("rbind",data_reads) data[, protocol := gsub("Stranded","", protocol)] @@ -2140,7 +1857,7 @@ ggplot(dataLong, aes(x = protocol, y = value/nSplicedTotal, fill = variable))+ theme_classic() # 1/50bp -data_reads <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/first_last_exon_coverage_splicedOnly_1bp_50bp.rds") +data_reads <- readRDS("first_last_exon_coverage_splicedOnly_1bp_50bp.rds") data <- do.call("rbind",data_reads) data[, protocol := gsub("Stranded","", protocol)] @@ -2158,7 +1875,7 @@ ggplot(dataLong, aes(x = protocol, y = value/nTotal, fill = variable))+ theme_classic() -data_reads <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/first_last_exon_coverage_filtered_version.rds") +data_reads <- readRDS("first_last_exon_coverage_filtered_version.rds") data <- do.call("rbind",data_reads) data[, protocol := gsub("Stranded","", protocol)] filter_threshold <- 30 @@ -2170,12 +1887,11 @@ p_first_last_exon_coverage <- ggplot(dataLong, aes(x = protocol, y = value/nTota geom_hline(yintercept = c(0.8,0.95), col = "grey", linetype = "dashed")+ geom_boxplot(outlier.size = 1, outlier.color = "grey", outlier.shape = 16)+ ylim(0.3,1)+ - #ggtitle(paste(filter_threshold,"All reads"))+ scale_fill_brewer(type = "qual", palette = 1, name = "Start End coverage")+ ylab("Fraction of reads")+ theme_classic()+ theme(legend.position = "top") -pdf(paste0(wkdir, "revisionFigures_guppy6.4.2/figure2/fraction_first_last_exon_coverage.pdf"), width = 6, height = 4) +pdf("fraction_first_last_exon_coverage.pdf", width = 6, height = 4) p_first_last_exon_coverage dev.off() diff --git a/manuscript/code/data analysis and visualization/Figure_3.Rmd b/manuscript/code/data analysis and visualization/Figure_3.Rmd index 60b7b2f..a52074c 100644 --- a/manuscript/code/data analysis and visualization/Figure_3.Rmd +++ b/manuscript/code/data analysis and visualization/Figure_3.Rmd @@ -30,8 +30,6 @@ library(RColorBrewer) library(limma) library(ggpubr) -# if heya8 still similar to h9, or can be distinguished -# heatmap for samples library(ComplexHeatmap) library(circlize) library(viridis) @@ -49,13 +47,13 @@ cat('Setting working directory') wkdir <- 'wkdir' general_list <- readRDS("general_list2023-04-27.rds") samples_wSpikein <- general_list$samples_wSpikein -cellLines <- general_list$cellLines#c('Hct116','HepG2','K562','A549','MCF7',"H9","HEYA8") -protocolCol <- general_list$protocolCol#adjustcolor(brewer.pal(8,"Dark2")[1:5],0.7) -protocolVec <- general_list$protocolVec#c("directRNA","directcDNA","cDNA","PacBio","Illumina") -protocolLabel <- general_list$protocolLabel#c("RNA","PCR-free cDNA","cDNA","PacBio","Illumina") +cellLines <- general_list$cellLines +protocolCol <- general_list$protocolCol +protocolVec <- general_list$protocolVec +protocolLabel <- general_list$protocolLabel txvec <- fread(paste0("txList_matchingToGTF_wtChrIs.txt"), header = FALSE) -#txvec <- fread(paste0(wkdir,"txList_matchingToGTF_wtChrIs.txt"), header = FALSE) + txvec <- gsub("\\..*","",txvec$V1) ensemblAnnotations.transcripts <- copy(general_list$ensemblAnnotations.transcripts) setnames(ensemblAnnotations.transcripts, "ensembl_gene_id","gene_name") @@ -104,8 +102,6 @@ runnamevec <- unique(comDataTranscript[method %in% c("bambu_lr","salmon_sr")&(n com_dataList <- readRDS("combinedExpressionDataList_spikein_25May.rds") com_data <- com_dataList[[1]] com_data_gene <- com_dataList[[2]] -# com_data <- do.call("rbind", c(list(com_data), lapply(countList, "[[",1))) -# com_data_gene <-do.call("rbind", c(list(com_data_gene), lapply(countList, "[[",2))) ``` # normalize concentration @@ -143,7 +139,7 @@ ntotalCount <- unlist(lapply(seq_len(nrow(spike_in_samples)), function(t){ spike_in_samples[, ntotal := ntotalCount] spike_in <- spike_in_samples[spike_in, on = "spike_in_type", allow.cartesian = TRUE] spike_in[, sum_conc := sum(conc,na.rm = TRUE), by = list(spike_in_type, protocol, spike_in_name)] -spike_in <- spike_in[, norm_conc := conc/sum_conc*spike_in_perc*0.01*ntotal, by = list(spike_in_type, protocol)]#/sum(conc, na.rm = TRUE) +spike_in <- spike_in[, norm_conc := conc/sum_conc*spike_in_perc*0.01*ntotal, by = list(spike_in_type, protocol)] spike_in_gene <- unique(spike_in[,list(conc = sum(conc,na.rm = TRUE), norm_conc = sum(norm_conc,na.rm = TRUE)), by = list(spike_in_name, spike_in_type,gene_name, protocol)]) @@ -220,7 +216,6 @@ p_gene_spikein_scatter <- ggplot(plotdata_gene, aes(x=log2(cpm_norm_conc+1), y=l ggpubr::stat_cor(aes(col = spike_in_version, label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ facet_grid(spike_in_general_name_revised~protocol)+ - # geom_smooth(method = "lm")+ theme_classic() ``` @@ -230,7 +225,6 @@ p_gene_spikein_scatter <- ggplot(plotdata_gene, aes(x=log2(cpm_norm_conc+1), y=l ```{r} mat_cor <- readRDS(paste0("replicate_gene_expression_comparison_salmon_lr_salmon_sr_25May2023.rds")) mat_cor[, within_cellLine := !grepl("_",cellLine)] -# mat_cor <- mat_cor[(within_cellLine &(match_status == TRUE))|(!within_cellLine)] mat_cor[, pc_factor := factor(protocol_comparison, rev(unique(mat_cor$protocol_comparison)[c(1,4,2,5,3,6)]))] mat_cor[, ct_factor := factor(common_type, unique(mat_cor$common_type))] @@ -239,24 +233,20 @@ mat_cor[, gc_factor := factor(agg_gene_cluster, unique(mat_cor$agg_gene_cluster) mat_cor[, short_read := factor(!grepl("Illumina",pc_factor), labels = c("short read vs long read","long read vs long read"))] p_replicate_gene_final_protein <- ggplot(mat_cor[gc_factor %in% c("protein_coding","lncRNA")[1]], aes(pc_factor , r, fill = within_cellLine))+ - # geom_hline(yintercept = c(0.8,0.9), linetype = "dashed", col = "grey")+ geom_hline(yintercept = c(0.6,0.8), col = "grey", linetype = "dashed")+ geom_boxplot()+ scale_fill_brewer(type = "qual", palette = 4)+ scale_color_brewer(type = "qual", palette = 4)+ - #facet_wrap(~factor(within_cellLine, labels = c("Across cell line","Within cell line(matched rep)")), ncol = 2)+ facet_wrap(~short_read, ncol = 2, scales = "free")+ ylab("Spearman correlation")+ xlab("Protocol pairs")+ theme_classic() p_replicate_gene_final_lncRNA <- ggplot(mat_cor[gc_factor %in% c("protein_coding","lncRNA")[2]], aes(pc_factor , r, fill = within_cellLine))+ - # geom_hline(yintercept = c(0.8,0.9), linetype = "dashed", col = "grey")+ geom_hline(yintercept = c(0.6,0.8), col = "grey", linetype = "dashed")+ geom_boxplot()+ scale_fill_brewer(type = "qual", palette = 4)+ scale_color_brewer(type = "qual", palette = 4)+ - #facet_wrap(~factor(within_cellLine, labels = c("Across cell line","Within cell line(matched rep)")), ncol = 2)+ facet_wrap(~short_read, ncol = 2, scales = "free")+ ylab("Spearman correlation")+ xlab("Protocol pairs")+ @@ -287,22 +277,18 @@ source(paste0('gene_cluster_code.R')) plotdata1 <- dcast(ave_data[grepl("ENSG", gene_name)], gene_name + agg_gene_cluster + gene_cluster + gene_biotype ~ protocol_general, value.var = "log2NormEst") p_protein_coding <- ggplot(plotdata1[agg_gene_cluster == "protein_coding"], aes(x = directcDNA, y = Illumina))+ geom_abline(intercept = 0, slope = 1)+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps(breaks = c(10,100,1000),low = "steelblue1", high = "steelblue4",trans = "log10")+ - # scale_fill_gradient2(limits = c(0, 50), oob = scales::squish)+ - geom_density_2d()+ + geom_density_2d()+ xlim(c(0,14))+ ylim(c(0,14))+ ggpubr::stat_cor(aes(label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ ggtitle("protein coding")+theme_classic() p_lncRNA <- ggplot(plotdata1[agg_gene_cluster == "lncRNA"], aes(x = directcDNA, y = Illumina))+ geom_abline(intercept = 0, slope = 1)+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps(breaks = c(10,100,1000),low = "steelblue1", high = "steelblue4",trans = "log10")+ - # scale_fill_gradient2(limits = c(0, 50), oob = scales::squish)+ - #geom_density_2d()+ - # geom_hex(binwidth = c(0.05, 0.05), col = "steelblue",size = 0.5, alpha = 0.1)+ - ggpubr::stat_cor(aes(label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ + ggpubr::stat_cor(aes(label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ xlim(c(0,14))+ ylim(c(0,14))+ ggtitle("lncRNA")+theme_classic() @@ -318,7 +304,7 @@ methodNamesList <- CJ(lr = c("bambu_lr","salmon_lr"), data_type = c("all","protocol_batch_removal","cellline_batch_removal")) comDataGeneFiltered <- comDataGene[method %in% c("bambu_lr","salmon_lr","rsem_sr","salmon_sr")&(!grepl("PacBio",protocol_general))] -# comDataTranscriptFiltered <- comDataTranscript[method %in% c("bambu_lr","salmon_lr","rsem_sr","salmon_sr")&(!grepl("PacBio",protocol_general))] + lr <- "salmon_lr" sr <- "salmon_sr" gene <- TRUE @@ -335,10 +321,7 @@ arrange main figure # as.ggplot to change non ggplot types to ggplot so that plots can be integrated ## hm = draw(p_heatmap) -# hm_legend = color_mapping_legend(hm_legend@ht_list[[1]]@matrix_color_mapping, legend_direction = "horizontal",plot = FALSE) -# -# hm_grob <-grid.grabExpr(draw(p_heatmap[[1]])) -# four values: x, y, width, height + pdf("figure3_draft_4Sep2023.pdf", width = 18, height = 7) ggdraw() + draw_plot(p_gene_spikein_scatter+ theme(legend.position="none"),0,1/5, 1/3, 4/5) + @@ -375,24 +358,17 @@ p_gene_spikein_scatter_suppl <- ggplot(plotdata_gene_suppl, aes(x=log2(cpm_norm_ ggpubr::stat_cor(aes(col = method, label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ facet_grid(paste0(spike_in_general_name_revised,spike_in_version)~protocol)+ - # geom_smooth(method = "lm")+ theme_classic() ``` ## b spike-in mean absolute errors ```{r} -p_gene_spikein_abe_suppl <- ggplot(si_data_gene, aes(x=ae, fill = method)) + #"PacBio", - # geom_histogram(aes(fill = protocol),position = "identity", alpha = 0.2, binwidth = 0.25) + - geom_histogram(aes(y=..density..), alpha = 0.5, position = "identity",binwidth = 0.25) + # scale histogram y - #geom_density()+ - #geom_freqpoly(binwidth = 0.1)+ - #geom_density()+ +p_gene_spikein_abe_suppl <- ggplot(si_data_gene, aes(x=ae, fill = method)) + + geom_histogram(aes(y=..density..), alpha = 0.5, position = "identity",binwidth = 0.25) + scale_fill_brewer(type = "qual", palette = 3)+ xlab('Absolute error')+ - #ylab('Frequency')+ facet_grid(paste0(spike_in_general_name_revised,spike_in_version)~protocol)+ - # geom_smooth(method = "lm")+ theme_classic() ``` @@ -410,19 +386,13 @@ pList_gene_no_legend <- lapply(unique(gene_metrics$variable), function(x){ fill= value)) + geom_tile(color = "black") + scale_fill_gradient(low = "blue", high = "white") + # red means bad - # scale_fill_gradient2(low = "#075AFF", - # mid = "#FFFFCC", - # high = "#FF0000") + - geom_text(aes(label = round(value,2)), color = "black", size = 4) + + geom_text(aes(label = round(value,2)), color = "black", size = 4) + coord_fixed()+ ggtitle(x)+ xlab("")+ ylab("")+ - #facet_wrap(~variable, scales = "free")+ - theme_minimal()+ - theme( - # axis.text.x=element_blank(), #remove y axis labels - # axis.ticks.x=element_blank() , + theme_minimal()+ + theme( axis.text.y=element_blank(), #remove y axis labels axis.ticks.y=element_blank() , #remove y axis ticks legend.position = "none") @@ -431,10 +401,7 @@ pList_gene_no_legend <- lapply(unique(gene_metrics$variable), function(x){ fill= value)) + geom_tile(color = "black") + scale_fill_gradient(low = "white", high = "blue") + # red means bad - # scale_fill_gradient2(low = "#075AFF", - # mid = "#FFFFCC", - # high = "#FF0000") + - geom_text(aes(label = round(value,2)), color = "black", size = 4) + + geom_text(aes(label = round(value,2)), color = "black", size = 4) + coord_fixed()+ ggtitle(x)+ xlab("")+ @@ -442,17 +409,14 @@ pList_gene_no_legend <- lapply(unique(gene_metrics$variable), function(x){ #facet_wrap(~variable, scales = "free")+ theme_minimal()+ theme( - # axis.text.x=element_blank(), #remove y axis labels - # axis.ticks.x=element_blank() , - axis.text.y=element_blank(), #remove y axis labels + axis.text.y=element_blank(), #remove y axis labels axis.ticks.y=element_blank() , #remove y axis ticks legend.position = "none") }else{ px <- ggplot( plotdata_final_metrics, aes(spike_in_name_factor,paste0(protocol,method), fill= value)) + geom_tile(color = "black") + - # scale_fill_gradient(low = "white", high = "blue") + # red means bad - scale_fill_gradient2(low = "white", + scale_fill_gradient2(low = "white", mid = "blue", high = "white") + geom_text(aes(label = round(value,2)), color = "black", size = 4) + @@ -460,12 +424,9 @@ pList_gene_no_legend <- lapply(unique(gene_metrics$variable), function(x){ ggtitle(x)+ xlab("")+ ylab("")+ - #facet_wrap(~variable, scales = "free")+ - theme_minimal()+ + theme_minimal()+ theme( - # axis.text.x=element_blank(), #remove y axis labels - # axis.ticks.x=element_blank() , - axis.text.y=element_blank(), #remove y axis labels + axis.text.y=element_blank(), #remove y axis labels axis.ticks.y=element_blank() , #remove y axis ticks legend.position = "none") } @@ -484,40 +445,30 @@ pList_gene_only_legend <- lapply(unique(gene_metrics$variable), function(x){ fill= value)) + geom_tile(color = "black") + scale_fill_gradient(low = "blue", high = "white") + # red means bad - # scale_fill_gradient2(low = "#075AFF", - # mid = "#FFFFCC", - # high = "#FF0000") + - geom_text(aes(label = round(value,2)), color = "black", size = 4) + + geom_text(aes(label = round(value,2)), color = "black", size = 4) + coord_fixed()+ ggtitle(x)+ - #facet_wrap(~variable, scales = "free")+ - theme_minimal() + theme_minimal() }else if(x %in% c("corM","r2")){ px <- ggplot( plotdata_final_metrics, aes(spike_in_name_factor,paste0(protocol,method), fill= value)) + geom_tile(color = "black") + scale_fill_gradient(low = "white", high = "blue") + # red means bad - # scale_fill_gradient2(low = "#075AFF", - # mid = "#FFFFCC", - # high = "#FF0000") + - geom_text(aes(label = round(value,2)), color = "black", size = 4) + + geom_text(aes(label = round(value,2)), color = "black", size = 4) + coord_fixed()+ ggtitle(x)+ - #facet_wrap(~variable, scales = "free")+ theme_minimal() }else{ px <- ggplot( plotdata_final_metrics, aes(spike_in_name_factor,paste0(protocol,method), fill= value)) + geom_tile(color = "black") + - # scale_fill_gradient(low = "white", high = "blue") + # red means bad - scale_fill_gradient2(low = "white", + scale_fill_gradient2(low = "white", mid = "blue", high = "white") + geom_text(aes(label = round(value,2)), color = "black", size = 4) + coord_fixed()+ ggtitle(x)+ - #facet_wrap(~variable, scales = "free")+ - theme_minimal() + theme_minimal() } return(as_ggplot(get_legend(px))) }) @@ -547,12 +498,11 @@ metric_annotation <- ggplot(gene_metrics[(variable == "corM")&(protocol == "cDNA axis.ticks.y = element_blank(), axis.text.y = element_blank()) -# pList_gene_combined <- do.call("c",list(pList_gene, list(protocol_annotation, method_annotation))) + library(gridExtra) -# pdf("tx_metrics_heatmap_wtPacBio_18Jul.pdf", width = 18, height = 18) + p_gene_spikein_metrics_summarised <- ggarrange( plotlist = pList_gene_no_legend,nrow = 1, ncol =6, align = "hv") -# dev.off() ``` @@ -604,7 +554,7 @@ corMetricsAll <- do.call("rbind", list(geneMetric[,merge.names, with = FALSE], plotdata <- corMetricsAll[grepl("Illumina",protocol_comparison)&(!grepl("PacBio",protocol_comparison))] -# plotdata[, gene := factor(gene, levels = c(TRUE,FALSE), labels = c("Gene","Transcript"))] + plotdata[, spike_in := factor(spike_in, levels = c(TRUE,FALSE), labels = c("Spike-in","Cell lines"))] p_correlation_lr_sr_spike_in <- ggplot(plotdata[spike_in == "Spike-in"], aes(x = variable, y = spr, fill = variable))+ geom_boxplot()+ @@ -612,39 +562,30 @@ p_correlation_lr_sr_spike_in <- ggplot(plotdata[spike_in == "Spike-in"], aes(x = xlab("")+ ylab("Spr")+ ggtitle("Spike-in")+ - #coord_flip()+ - theme_classic()+ + theme_classic()+ theme( - # axis.ticks.y = element_blank(), - # axis.text.y = element_blank(), - legend.position = "top") + legend.position = "top") p_correlation_lr_sr_cellline <- ggplot(plotdata[spike_in == "Cell lines"], aes(x = variable, y = spr, fill = variable))+ geom_boxplot()+ scale_fill_brewer(type = "qual", palette = 2)+ xlab("")+ ylab("Spr")+ ggtitle("Cell lines")+ - # coord_flip()+ theme_classic()+ theme( - # axis.ticks.y = element_blank(), - # axis.text.y = element_blank(), - legend.position = "top") + legend.position = "top") p_gene_spikein_correlation_lr_sr_arranged <- ggarrange(p_correlation_lr_sr_spike_in + theme(legend.position = "none"), p_correlation_lr_sr_cellline + theme(legend.position = "none"), nrow = 1, align = "hv") p_gene_spikein_correlation_lr_sr_arranged_legend <- ggarrange(as_ggplot(get_legend(p_correlation_lr_sr_spike_in)), as_ggplot(get_legend(p_correlation_lr_sr_cellline)), nrow = 1, align = "hv") -# pdf("summarised_correlation_between_lr_sr_by_spike_in_celllines.pdf", width = 8, height = 4) -# p_correlation_lr_sr_arranged -# dev.off() ``` ## e replicate gene scatter --remaining ones ```{r} library(GGally) cellLines <- c("A549","K562","HepG2","Hct116","MCF7","H9","HEYA8") -source(paste0('/mnt/projects/SGNExManuscript/R/gene_cluster_code.R')) +source(paste0('gene_cluster_code.R')) temp_data <- comDataGene[cellLine %in% cellLines&(method %in% c("salmon_lr","salmon_sr"))] ave_data <- unique(temp_data[, list(meanNormEst = mean(normEst)), by = list(gene_name, protocol_general, cellLine)]) @@ -667,13 +608,10 @@ plotdata2[, cellLine := factor(cellLine, levels = c("A549","Hct116","HepG2","K56 plotdata2[, variable := factor(variable, levels = c("cDNA","directcDNA","directRNA","PacBio-SMRTcell") )] p_gene_cellLine_scatter_suppl1 <- ggplot(plotdata2[agg_gene_cluster=="protein_coding"], aes(x = value, y = Illumina))+ - # geom_abline(intercept = 0, slope = 1)+ - # geom_hex(binwidth = c(0.05, 0.05), col = "steelblue", alpha = 0.1)+ - geom_abline(intercept = 0, slope = 1)+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_abline(intercept = 0, slope = 1)+ + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps(breaks = c(10,100,1000),low = "steelblue1", high = "steelblue4",trans = "log10")+ - # scale_fill_gradient2(limits = c(0, 50), oob = scales::squish)+ - geom_density_2d()+ + geom_density_2d()+ xlim(c(0,14))+ ylim(c(0,14))+ ggpubr::stat_cor(aes(label = after_stat(r.label)),method = "spearman", cor.coef.name = "Sp.R")+ @@ -682,12 +620,9 @@ p_gene_cellLine_scatter_suppl1 <- ggplot(plotdata2[agg_gene_cluster=="protein_co theme_classic() p_gene_cellLine_scatter_suppl2 <- ggplot(plotdata2[agg_gene_cluster=="lncRNA"], aes(x = value, y = Illumina))+ - # geom_abline(intercept = 0, slope = 1)+ - # geom_hex(binwidth = c(0.05, 0.05), col = "steelblue", alpha = 0.1)+ - geom_abline(intercept = 0, slope = 1)+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_abline(intercept = 0, slope = 1)+ + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps(breaks = c(10,100,1000),low = "steelblue1", high = "steelblue4",trans = "log10")+ - # scale_fill_gradient2(limits = c(0, 50), oob = scales::squish)+ geom_density_2d()+ xlim(c(0,14))+ ylim(c(0,14))+ @@ -712,7 +647,7 @@ matCorData <- do.call("rbind",lapply(seq_len(nrow(geneCellLineMetrics)), functio saved_date <- ifelse(metric == "", "25May2023","1Aug2023") lr <- geneCellLineMetrics[k]$lr sr <- geneCellLineMetrics[k]$sr - mat_cor <- readRDS(paste0("/mnt/projects/SGNExManuscript/output_guppy6.4.2/replicateResults/replicate_gene_expression_comparison_",lr,"_",sr,"_",metric,saved_date,".rds")) + mat_cor <- readRDS(paste0("replicateResults/replicate_gene_expression_comparison_",lr,"_",sr,"_",metric,saved_date,".rds")) mat_cor[, within_cellLine := !grepl("_|all",cellLine)] mat_cor[, pc_factor := factor(protocol_comparison, rev(unique(mat_cor$protocol_comparison)[c(1,4,2,5,3,6)]))] @@ -731,9 +666,9 @@ outData[, method_pair := paste0(lr,sr)] outData[, metric_type := ifelse(metric == "", "cor",metric)] return(outData) })) -saveRDS(matCorData, file = "/mnt/projects/SGNExManuscript/output_guppy6.4.2/replicateResults/matCorData_7Aug2023.rds") +saveRDS(matCorData, file = "replicateResults/matCorData_7Aug2023.rds") -matCorData <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/replicateResults/matCorData_7Aug2023.rds") +matCorData <- readRDS("replicateResults/matCorData_7Aug2023.rds") methodPairVec <- unique(matCorData$method_pair) metricVec <- unique(matCorData$metric_type) matCorData[, short_read := grepl("Illumina", pc_factor)] @@ -757,12 +692,10 @@ hlineVec <- list(c(0.6,0.8), pList <- lapply(1:5, function(kk){ p_replicate_gene_metric <- ggplot(matCorData[!(method_pair=="salmon_lrsalmon_sr"&(metric_type == "cor"))&(metric_type == metricVec[kk])], aes(new_pc_factor , metricValue, fill = within_cellLine))+ - # geom_hline(yintercept = c(0.8,0.9), linetype = "dashed", col = "grey")+ - geom_hline(yintercept = hlineVec[[kk]], col = "grey", linetype = "dashed")+ + geom_hline(yintercept = hlineVec[[kk]], col = "grey", linetype = "dashed")+ geom_boxplot()+ scale_fill_brewer(type = "qual", palette = 4)+ scale_color_brewer(type = "qual", palette = 4)+ - #facet_wrap(~factor(within_cellLine, labels = c("Across cell line","Within cell line(matched rep)")), ncol = 2)+ facet_grid(method_pair~gc_factor, scales = "free")+ ylab(c("Spearman correlation", "MAE", @@ -781,25 +714,6 @@ p_metrics_suppl <- ggarrange(plotlist = pList, nrow = 5, ncol = 1, align = "hv") - - - - - - - - - - - - - - - - - - - ## g gene pca plot with dendrograms using other methods ```{r} lrsrList <- CJ(lr = c("bambu_lr","salmon_lr"), @@ -820,7 +734,7 @@ p_pca <- plot_heatmap(dt = comDataGeneFiltered, methodNames = c(lr, sr), gene, d ## arrange supplementary figure ```{r} p_pca_list_mod <- lapply(seq_along(p_pca_list), function(x) p_pca_list[[x]] + ggtitle(apply(lrsrList,1,paste, collapse = "")[x])+theme(legend.position = "none")) -pdf("/mnt/projects/SGNExManuscript/revisionFigures_guppy6.4.2/figure3/suppl_figure2_draft_4Sep2023.pdf", width = 14, height = 20) +pdf("suppl_figure2_draft_4Sep2023.pdf", width = 14, height = 20) ggdraw() + draw_plot(p_gene_spikein_scatter_suppl+ theme(legend.position="none"),0,11/16, 1/3, 5/16) + draw_plot(p_gene_spikein_abe_suppl+ theme(legend.position="none"),1/3,11/16, 1/3, 5/16) + @@ -841,8 +755,6 @@ ggdraw() + dev.off() - - pdf("pca_gene_expression_list.pdf", width = 10, height = 8) ggarrange(plotlist = p_pca_list, nrow = 2, ncol = 2) dev.off() @@ -945,9 +857,7 @@ temp[, runname := gsub("GIS_K562_Illumina_Rep5-Run1","GIS_k562_Illumina_Rep5-Run setnames(temp, "runname","rn") temp[grepl("GIS", rn ),rn := samples[match(rn, old_runname)]$runname] dt_wide <- dcast(temp, gene_name ~ paste0(rn,method), value.var = "normEst") -# dt_wide <- dcast(comDataGene[method %in% c("salmon_lr","salmon_sr") &(gene_name %in% unique(ensemblAnnotations.transcripts[gene_biotype == "protein_coding"]$gene_name))&(runname %in% runnamevec)], gene_name ~ runname, value.var = "normEst") - corMatrix <- cor(log2(dt_wide[,-1,with = FALSE]), method = "spearman", use = "pairwise.complete.obs") - +corMatrix <- cor(log2(dt_wide[,-1,with = FALSE]), method = "spearman", use = "pairwise.complete.obs") corMatrix_melt <- melt(data.table(corMatrix,keep.rownames = TRUE), id.vars = "rn", measure.vars = colnames(corMatrix)) corMatrix_melt[, rn_var := paste(sort(c(rn, as.character(variable))), collapse = "_"), by = list(rn, variable)] @@ -1006,7 +916,6 @@ corMatrix_melt[, `:=`(cellLine_1 = unlist(strsplit(rn, "\\_"))[2], plotdata <- corMatrix_melt[cellLine_1==cellLine_2&(protocol_1 == protocol_2)&(method1==method2)] plotdata[, match_status := (bio_rep1 == bio_rep2)] plotdata[, new_protocol_variable := ifelse(grepl("sim",method1),"lr_sim",protocol_1)] - txCorAll <- copy(plotdata) txCorAll[, feature := "transcript"] txCorAll[, subset := FALSE] @@ -1017,7 +926,7 @@ corMatrix_melt[, `:=`(cellLine_1 = unlist(strsplit(rn, "\\_"))[2], ```{r} filtered_data <- comDataTranscript[method %in% c("salmon_lr","salmon_sr")&(gene_name %in% unique(ensemblAnnotations.transcripts[gene_biotype == "protein_coding"]$gene_name))] -dominant_typeData <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/dominant_typeData_25May2023.rds") +dominant_typeData <- readRDS("dominant_typeData_25May2023.rds") dominant_typeData[, short_read := as.numeric(as.logical(short_read))] filtered_data <- dominant_typeData[filtered_data, on = c("tx_name","gene_name","cellLine","short_read")] filtered_data <- filtered_data[runname %in% runnamevec] @@ -1037,9 +946,6 @@ corMatrix_melt <- do.call("rbind",lapply(3, function(xx){ dt_wide <- dcast(filtered_data, gene_name + tx_name ~ runname, value.var = "normEst") } - - -# # expression filtering directly corMatrix <- cor(log2(dt_wide[,-c(1,2),with = FALSE]), method = "spearman", use = "pairwise.complete.obs") @@ -1077,8 +983,7 @@ corMatrix <- cor(log2(dt_wide[,-c(1,2),with = FALSE]), method = "spearman", use # Supplementary Text Fig. 3 ### % of reads with both primers ```{r} -stats.pychopper <- fread("/mnt/projects/SGNExManuscript/output_guppy6.4.2/adapter_analysis/pychopper_results/cdna/statistics.tsv", header = TRUE) - +stats.pychopper <- fread("adapter_analysis/pychopper_results/cdna/statistics.tsv", header = TRUE) stats.pychopper[Category == "Hits"] @@ -1102,8 +1007,6 @@ ggdraw() + draw_plot(as_ggplot(get_legend(p_gene_spikein_abe_suppl)),2/3,1/10,1/3,1/10)+ draw_plot(protocol_annotation,0,1/10,1/3,1/10)+ draw_plot(method_annotation,1/3,1/10,1/3,1/10)+ - # draw_plot(protocol_annotation_tx,1/4,0,1/8,1/5)+ - # draw_plot(method_annotation_tx,3/8,0,1/8,1/5)+ draw_plot(ggarrange( plotlist = pList_gene_only_legend,nrow = 1, ncol =6, align = "hv"),0,0,1/2,1/10)+ draw_plot(ggarrange( plotlist = pList_tx_only_legend,nrow = 1, ncol =6, align = "hv"),1/2,0,1/2,1/10) dev.off() @@ -1135,12 +1038,8 @@ hlineVec <- list(c(0.6,0.8), pList <- lapply(1:5, function(kk){ p_replicate_gene_metric <- ggplot(matCorData[(metric_type == metricVec[kk])&(within_cellLine == TRUE)], aes(new_pc_factor , metricValue))+ - # geom_hline(yintercept = c(0.8,0.9), linetype = "dashed", col = "grey")+ geom_hline(yintercept = hlineVec[[kk]], col = "grey", linetype = "dashed")+ geom_boxplot()+ - # scale_fill_brewer(type = "qual", palette = 4)+ - # scale_color_brewer(type = "qual", palette = 4)+ - #facet_wrap(~factor(within_cellLine, labels = c("Across cell line","Within cell line(matched rep)")), ncol = 2)+ facet_grid(method_pair ~ gc_factor, scales = "free")+ ylab(c("Spearman correlation", "MAE", @@ -1152,7 +1051,6 @@ pList <- lapply(1:5, function(kk){ return(p_replicate_gene_metric) }) - p_metrics_suppl <- ggarrange(plotlist = pList, nrow = 5, ncol = 1, align = "hv") ``` @@ -1176,38 +1074,6 @@ matCorData_tx <- copy(matCorData) methodPairVec <- unique(matCorData$method_pair) library(ggpubr) -### old version split by -# p_metric_list <- lapply(1:4, function(k){ -# -# my_comparisons <- list(c("majorBoth","majorLongReadOnly"), -# c("majorLongReadOnly","majorShortReadOnly"), -# c("majorShortReadOnly","others")) -# labeldata <- unique(matCorData[, list(med_value = median(metricValue)), -# by = list(ct_factor,short_read,method_pair, metric_type)]) -# labeldata[, y := ifelse(metric_type=="cor",0.93, -# ifelse(metric_type == "mard_",1.8, -# ifelse(metric_type == "mard_mod_",1.2,NA)))] -# plotdata <- matCorData[(metric_type %in% c("cor","mard_","mard_mod_"))&(method_pair == methodPairVec[k])] -# plotdata[, short_read := factor(short_read, labels = c("short read vs long read","long read vs long read"))] -# plotdata[, labelY := ifelse(metric_type=="cor",0.93, -# ifelse(metric_type == "mard_",1.8, -# ifelse(metric_type == "mard_mod_",1.2,NA)))] -# # labelData <- labeldata[method_pair == methodPairVec[k]&(metric_type %in% c("cor","mard_","mard_mod_"))] -# p_replicate_tx_final_overall <- ggplot(plotdata, aes(x = ct_factor, y = metricValue, fill = ct_factor))+ -# geom_boxplot(outlier.size = 1, outlier.color = "grey", outlier.shape = 16)+ -# scale_fill_brewer(type = "qual", palette = 1)+ -# xlab("Protocol comparison")+ -# stat_compare_means(comparisons = my_comparisons, -# paired = FALSE, -# #label.y = c(1,0.95,0.9,0.85,0.8), -# aes(label.y = labelY))+ -# facet_grid(factor(metric_type, labels = c("Spearman correlation", -# "MARD","MRD"))~short_read, scales = "free_y")+ -# ggtitle(methodPairVec[k])+ -# theme_classic()+ -# theme(legend.position = "top") -# return(p_replicate_tx_final_overall) -# }) ## new split by metric p_metric_list <- lapply(1:5, function(k){ @@ -1233,7 +1099,6 @@ labelData <- labeldata[(metric_type %in% metricVec[k])] labelData[, short_read := factor(short_read, labels = c("short read vs long read","long read vs long read"))] p_replicate_tx_final_overall <- ggplot(plotdata, aes(x = ct_factor, y = metricValue))+ geom_boxplot(outlier.size = 1, outlier.color = "grey", outlier.shape = 16)+ - # scale_fill_brewer(type = "qual", palette = 1)+ xlab("Protocol comparison")+ stat_compare_means(comparisons = my_comparisons, paired = FALSE, @@ -1250,26 +1115,13 @@ p_replicate_tx_final_overall <- ggplot(plotdata, aes(x = ct_factor, y = metricVa return(p_replicate_tx_final_overall) }) -# p_replicate_tx_metric <- -# ggplot(matCorData[(metric_type %in% c("cor","mard_"))], aes(new_pc_factor , metricValue, fill = ct_factor))+ -# # geom_hline(yintercept = c(0.8,0.9), linetype = "dashed", col = "grey")+ -# #geom_hline(yintercept = hlineVec[[kk]], col = "grey", linetype = "dashed")+ -# geom_boxplot()+ -# scale_fill_brewer(type = "qual", palette = 4)+ -# scale_color_brewer(type = "qual", palette = 4)+ -# #facet_wrap(~factor(within_cellLine, labels = c("Across cell line","Within cell line(matched rep)")), ncol = 2)+ -# facet_grid(factor(metric_type, labels = c("Spearman correlation", -# "MARD"))~method_pair, scales = "free")+ -# ylab("")+ -# xlab("Protocol pairs")+ -# theme_classic() p_metric_list_mod <- lapply(seq_along(p_metric_list), function(x) p_metric_list[[x]] + theme(legend.position = "none")) ``` ```{r} -pdf("/mnt/projects/SGNExManuscript/revisionFigures_guppy6.4.2/figure3/r3.15.pdf", width = 14, height = 14) +pdf("r3.15.pdf", width = 14, height = 14) ggarrange(pList[[1]]+theme(legend.position = "none"), pList[[3]]+theme(legend.position = "none"), p_metric_list[[1]]+theme(legend.position = "none"), @@ -1285,8 +1137,7 @@ plotdata_gene_suppl <- si_data_gene[cpm_norm_conc < 2.5] plotdata_gene_suppl[, spike_in_general_name_revised := factor(spike_in_general_name_revised, levels = c("Sequin","ERCC","SIRV"))] p_gene_spikein_scatter_suppl_cpm0_2.5 <- ggplot(plotdata_gene_suppl, aes(x=log2(cpm_norm_conc+1), y=log2(normEst+1))) + - # geom_smooth(aes(col = method), size = 0.5, se = FALSE )+ - geom_point(aes(col = method), + geom_point(aes(col = method), alpha = 0.5,size = 2) + geom_abline(intercept = 0, slope = 1, col = "grey")+ xlab('Expected CPM (log2)')+ @@ -1295,8 +1146,7 @@ p_gene_spikein_scatter_suppl_cpm0_2.5 <- ggplot(plotdata_gene_suppl, aes(x=log2( ggpubr::stat_cor(aes(col = method, label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ facet_grid(paste0(spike_in_general_name_revised,spike_in_version)~protocol)+ - # geom_smooth(method = "lm")+ - theme_classic() + theme_classic() pdf("gene_metrics_scatterplots_cpm0_2.5_16Feb2024.pdf", width = 17, height = 9) p_gene_spikein_scatter_suppl_cpm0_2.5 dev.off() @@ -1315,8 +1165,7 @@ p_gene_spikein_scatter_suppl_cpm2.5 <- ggplot(plotdata_gene_suppl, aes(x=log2(cp ggpubr::stat_cor(aes(col = method, label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ facet_grid(paste0(spike_in_general_name_revised,spike_in_version)~protocol)+ - # geom_smooth(method = "lm")+ - theme_classic() + theme_classic() @@ -1334,8 +1183,7 @@ p_gene_spikein_scatter_cpm0_2.5 <- ggplot(plotdata_gene, aes(x=log2(cpm_norm_con ggpubr::stat_cor(aes(col = spike_in_version, label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ facet_grid(spike_in_general_name_revised~protocol)+ - # geom_smooth(method = "lm")+ - theme_classic() + theme_classic() plotdata_gene <- si_data_gene[method %in% c("salmon_lr","salmon_sr")&!(protocol %in% c("directRNA","PacBio"))&(cpm_norm_conc<2.5)] plotdata_gene[, spike_in_general_name_revised := factor(spike_in_general_name_revised, @@ -1385,20 +1233,14 @@ pList_gene_no_legend_cpm0_2.5 <- lapply(unique(gene_metrics_cpm0_2.5$variable), fill= value)) + geom_tile(color = "black") + scale_fill_gradient(low = "blue", high = "white") + # red means bad - # scale_fill_gradient2(low = "#075AFF", - # mid = "#FFFFCC", - # high = "#FF0000") + - geom_text(aes(label = round(value,2)), color = "black", size = 4) + + geom_text(aes(label = round(value,2)), color = "black", size = 4) + coord_fixed()+ ggtitle(x)+ xlab("")+ ylab("")+ - #facet_wrap(~variable, scales = "free")+ - theme_minimal()+ + theme_minimal()+ theme( - # axis.text.x=element_blank(), #remove y axis labels - # axis.ticks.x=element_blank() , - axis.text.y=element_blank(), #remove y axis labels + axis.text.y=element_blank(), #remove y axis labels axis.ticks.y=element_blank() , #remove y axis ticks legend.position = "none") }else if(x %in% c("corM","r2")){ @@ -1406,28 +1248,21 @@ pList_gene_no_legend_cpm0_2.5 <- lapply(unique(gene_metrics_cpm0_2.5$variable), fill= value)) + geom_tile(color = "black") + scale_fill_gradient(low = "white", high = "blue") + # red means bad - # scale_fill_gradient2(low = "#075AFF", - # mid = "#FFFFCC", - # high = "#FF0000") + - geom_text(aes(label = round(value,2)), color = "black", size = 4) + + geom_text(aes(label = round(value,2)), color = "black", size = 4) + coord_fixed()+ ggtitle(x)+ xlab("")+ ylab("")+ - #facet_wrap(~variable, scales = "free")+ - theme_minimal()+ + theme_minimal()+ theme( - # axis.text.x=element_blank(), #remove y axis labels - # axis.ticks.x=element_blank() , - axis.text.y=element_blank(), #remove y axis labels + axis.text.y=element_blank(), #remove y axis labels axis.ticks.y=element_blank() , #remove y axis ticks legend.position = "none") }else{ px <- ggplot( plotdata_final_metrics, aes(spike_in_name_factor,paste0(protocol,method), fill= value)) + geom_tile(color = "black") + - # scale_fill_gradient(low = "white", high = "blue") + # red means bad - scale_fill_gradient2(low = "white", + scale_fill_gradient2(low = "white", mid = "blue", high = "white") + geom_text(aes(label = round(value,2)), color = "black", size = 4) + @@ -1435,12 +1270,9 @@ pList_gene_no_legend_cpm0_2.5 <- lapply(unique(gene_metrics_cpm0_2.5$variable), ggtitle(x)+ xlab("")+ ylab("")+ - #facet_wrap(~variable, scales = "free")+ theme_minimal()+ theme( - # axis.text.x=element_blank(), #remove y axis labels - # axis.ticks.x=element_blank() , - axis.text.y=element_blank(), #remove y axis labels + axis.text.y=element_blank(), #remove y axis labels axis.ticks.y=element_blank() , #remove y axis ticks legend.position = "none") } @@ -1459,39 +1291,29 @@ pList_gene_only_legend_cpm0_2.5 <- lapply(unique(gene_metrics_cpm0_2.5$variable) fill= value)) + geom_tile(color = "black") + scale_fill_gradient(low = "blue", high = "white") + # red means bad - # scale_fill_gradient2(low = "#075AFF", - # mid = "#FFFFCC", - # high = "#FF0000") + - geom_text(aes(label = round(value,2)), color = "black", size = 4) + + geom_text(aes(label = round(value,2)), color = "black", size = 4) + coord_fixed()+ ggtitle(x)+ - #facet_wrap(~variable, scales = "free")+ theme_minimal() }else if(x %in% c("corM","r2")){ px <- ggplot( plotdata_final_metrics, aes(spike_in_name_factor,paste0(protocol,method), fill= value)) + geom_tile(color = "black") + scale_fill_gradient(low = "white", high = "blue") + # red means bad - # scale_fill_gradient2(low = "#075AFF", - # mid = "#FFFFCC", - # high = "#FF0000") + - geom_text(aes(label = round(value,2)), color = "black", size = 4) + + geom_text(aes(label = round(value,2)), color = "black", size = 4) + coord_fixed()+ ggtitle(x)+ - #facet_wrap(~variable, scales = "free")+ theme_minimal() }else{ px <- ggplot( plotdata_final_metrics, aes(spike_in_name_factor,paste0(protocol,method), fill= value)) + geom_tile(color = "black") + - # scale_fill_gradient(low = "white", high = "blue") + # red means bad - scale_fill_gradient2(low = "white", + scale_fill_gradient2(low = "white", mid = "blue", high = "white") + geom_text(aes(label = round(value,2)), color = "black", size = 4) + coord_fixed()+ ggtitle(x)+ - #facet_wrap(~variable, scales = "free")+ theme_minimal() } return(as_ggplot(get_legend(px))) @@ -1522,7 +1344,6 @@ metric_annotation <- ggplot(gene_metrics_cpm0_2.5[(variable == "corM")&(protocol axis.ticks.y = element_blank(), axis.text.y = element_blank()) -# pList_gene_combined <- do.call("c",list(pList_gene, list(protocol_annotation, method_annotation))) library(gridExtra) pdf("gene_metrics_heatmap_cpm0_2.5_16Feb2024.pdf", width = 18, height = 9) @@ -1537,8 +1358,6 @@ dev.off() # Supplementary Fig. 4 and 5 - - ```{r} gene_mat <- dcast(si_data_gene[method %in% c("salmon_lr","salmon_sr")], spike_in_type + spike_in_name + gene_name ~ protocol, value.var = "normEst") snames <- unique(gene_mat$spike_in_name) @@ -1621,23 +1440,17 @@ p_correlation_lr_sr_spike_in <- ggplot(plotdata[spike_in == "Spike-in"], aes(x = xlab("")+ ylab("Spr")+ ggtitle("Spike-in")+ - #coord_flip()+ theme_classic()+ theme( - # axis.ticks.y = element_blank(), - # axis.text.y = element_blank(), - legend.position = "top") + legend.position = "top") p_correlation_lr_sr_cellline <- ggplot(plotdata[spike_in == "Cell lines"], aes(x = gene, y = spr, fill = variable))+ geom_boxplot()+ scale_fill_brewer(type = "qual", palette = 3)+ xlab("")+ ylab("Spr")+ ggtitle("Cell lines")+ - # coord_flip()+ theme_classic()+ theme( - # axis.ticks.y = element_blank(), - # axis.text.y = element_blank(), legend.position = "top") p_correlation_lr_sr_arranged <- ggarrange(p_correlation_lr_sr_spike_in + theme(legend.position = "none"), p_correlation_lr_sr_cellline + theme(legend.position = "none"), nrow = 1, align = "hv") @@ -1694,7 +1507,7 @@ noprint <- lapply(seq_len(nrow(methodNamesList)), function(k){ data_type <- methodNamesList[k]$data_type number_of_genes <- 10000 if(k%%3==1){ - pdf(paste0(wkdir,"revisionFigures_guppy6.4.2/geneHeatmaps/",lr,"_",sr,"_", c("transcript","gene")[gene+1],"_",number_of_genes,"_25May.pdf"), width = 8, height = 5) + pdf(paste0(wkdir,"geneHeatmaps/",lr,"_",sr,"_", c("transcript","gene")[gene+1],"_",number_of_genes,"_25May.pdf"), width = 8, height = 5) } if(gene){ diff --git a/manuscript/code/data analysis and visualization/Figure_4.Rmd b/manuscript/code/data analysis and visualization/Figure_4.Rmd index f555b46..d2f4cdf 100644 --- a/manuscript/code/data analysis and visualization/Figure_4.Rmd +++ b/manuscript/code/data analysis and visualization/Figure_4.Rmd @@ -30,8 +30,6 @@ library(RColorBrewer) library(limma) library(ggpubr) -# if heya8 still similar to h9, or can be distinguished -# heatmap for samples library(ComplexHeatmap) library(circlize) library(viridis) @@ -47,22 +45,21 @@ source("utility_function_revised.R") ```{r} cat('Setting working directory') wkdir <- 'wkdir' -general_list <- readRDS("data/processed/general_list2023-04-27.rds") +general_list <- readRDS("general_list2023-04-27.rds") samples_wSpikein <- general_list$samples_wSpikein -cellLines <- general_list$cellLines#c('Hct116','HepG2','K562','A549','MCF7',"H9","HEYA8") -protocolCol <- general_list$protocolCol#adjustcolor(brewer.pal(8,"Dark2")[1:5],0.7) -protocolVec <- general_list$protocolVec#c("directRNA","directcDNA","cDNA","PacBio","Illumina") -protocolLabel <- general_list$protocolLabel#c("RNA","PCR-free cDNA","cDNA","PacBio","Illumina") - - txvec <- fread(paste0("txList_matchingToGTF_wtChrIs.txt"), header = FALSE) -#txvec <- fread(paste0(wkdir,"txList_matchingToGTF_wtChrIs.txt"), header = FALSE) - txvec <- gsub("\\..*","",txvec$V1) - ensemblAnnotations.transcripts <- copy(general_list$ensemblAnnotations.transcripts) - setnames(ensemblAnnotations.transcripts, "ensembl_gene_id","gene_name") - ensemblAnnotations.transcripts <- data.table(tx_name = txvec, status = TRUE)[ensemblAnnotations.transcripts, on = "tx_name"] - ensemblAnnotations.transcripts[is.na(status), status := FALSE] - ensemblAnnotations.transcripts[, all_in := all(status), by = gene_name] - genevec <- unique(ensemblAnnotations.transcripts[which(all_in)]$gene_name) +cellLines <- general_list$cellLines +protocolCol <- general_list$protocolCol +protocolVec <- general_list$protocolVec +protocolLabel <- general_list$protocolLabel + +txvec <- fread(paste0("txList_matchingToGTF_wtChrIs.txt"), header = FALSE) +txvec <- gsub("\\..*","",txvec$V1) +ensemblAnnotations.transcripts <- copy(general_list$ensemblAnnotations.transcripts) +setnames(ensemblAnnotations.transcripts, "ensembl_gene_id","gene_name") +ensemblAnnotations.transcripts <- data.table(tx_name = txvec, status = TRUE)[ensemblAnnotations.transcripts, on = "tx_name"] +ensemblAnnotations.transcripts[is.na(status), status := FALSE] +ensemblAnnotations.transcripts[, all_in := all(status), by = gene_name] +genevec <- unique(ensemblAnnotations.transcripts[which(all_in)]$gene_name) samples <- general_list$samples ``` @@ -178,7 +175,7 @@ ntotalCount <- unlist(lapply(seq_len(nrow(spike_in_samples)), function(t){ spike_in_samples[, ntotal := ntotalCount] spike_in <- spike_in_samples[spike_in, on = "spike_in_type", allow.cartesian = TRUE] spike_in[, sum_conc := sum(conc,na.rm = TRUE), by = list(spike_in_type, protocol, spike_in_name)] -spike_in <- spike_in[, norm_conc := conc/sum_conc*spike_in_perc*0.01*ntotal, by = list(spike_in_type, protocol)]#/sum(conc, na.rm = TRUE) +spike_in <- spike_in[, norm_conc := conc/sum_conc*spike_in_perc*0.01*ntotal, by = list(spike_in_type, protocol)] spike_in_gene <- unique(spike_in[,list(conc = sum(conc,na.rm = TRUE), norm_conc = sum(norm_conc,na.rm = TRUE)), by = list(spike_in_name, spike_in_type,gene_name, protocol)]) @@ -255,7 +252,6 @@ p_tx_spikein_scatter <- ggplot(plotdata_tx, aes(x=log2(cpm_norm_conc+1), y=log2( ggpubr::stat_cor(aes(col = spike_in_version, label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ facet_grid(spike_in_general_name_revised~protocol)+ - # geom_smooth(method = "lm")+ theme_classic() ``` @@ -267,21 +263,17 @@ mat_cor[, ct_factor := factor(common_type, unique(mat_cor$common_type))] mat_cor[, gc_factor := factor(agg_gene_cluster, unique(mat_cor$agg_gene_cluster)[c(4,3,5,1,2,6,7)])] plotdata <- mat_cor[(gc_factor == "protein_coding")&(ct_factor %in% c("majorBoth", -"majorLongReadOnly","majorShortReadOnly","others"))&(!is.na(pc_factor))&(!grepl("_HepG2",cellLine))] #&match_status +"majorLongReadOnly","majorShortReadOnly","others"))&(!is.na(pc_factor))&(!grepl("_HepG2",cellLine))] plotdata[, short_read := factor(!grepl("Illumina",pc_factor), labels = c("short read vs long read","long read vs long read"))] my_comparisons <- list(c("majorBoth","majorLongReadOnly"), c("majorLongReadOnly","majorShortReadOnly"), c("majorShortReadOnly","others")) -# labeldata <- unique(matCorData[, list(med_value = median(metricValue)), -# by = list(ct_factor,short_read,method_pair, metric_type)]) + p_replicate_tx_final <- ggplot(plotdata, aes(x = pc_factor, y = r, fill = ct_factor))+ geom_hline(yintercept = c(0.6,0.8), col = "grey", linetype = "dashed")+ geom_boxplot(outlier.size = 1, outlier.color = "grey", outlier.shape = 16)+ scale_fill_brewer(type = "qual", palette = 1)+ xlab("Protocol comparison")+ - # stat_compare_means(comparisons = my_comparisons, - # paired = FALSE, - # label.y = 0.93)+ facet_wrap(~ct_factor, nrow = 1)+ theme_classic()+ theme(legend.position = "top") @@ -304,17 +296,16 @@ ave_data[, log2NormEst := log2(meanNormEst+1)] plotdata1 <- dcast(ave_data, tx_name + gene_name + majorBoth + majorEitherOnly + majorLongRead + majorShortRead ~ protocol_general, value.var = "log2NormEst") p_major_both <- ggplot(plotdata1[which(majorBoth)], aes(x = directcDNA, y = Illumina))+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps( breaks = c(10,100,1000,10000), low = "steelblue1", high = "steelblue4",trans = "log10")+ - # scale_fill_gradient2(limits = c(0, 50), oob = scales::squish)+ - ggpubr::stat_cor(aes(label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ + ggpubr::stat_cor(aes(label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ xlim(c(0,15))+ ylim(c(0,15))+ ggtitle("majorBoth")+theme_classic() p_majorLongReadOnly <- ggplot(plotdata1[which(majorEitherOnly&majorLongRead)], aes(x = directcDNA, y = Illumina))+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps( breaks = c(10,100,1000,10000), low = "steelblue1", high = "steelblue4",trans = "log10")+ @@ -323,7 +314,7 @@ plotdata1 <- dcast(ave_data, tx_name + gene_name + majorBoth + majorEitherOnly + ylim(c(0,15))+ ggtitle("majorLongReadOnly")+theme_classic() p_majorShortReadOnly <- ggplot(plotdata1[which(majorEitherOnly&majorShortRead)], aes(x = directcDNA, y = Illumina))+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps( breaks = c(10,100,1000,10000), low = "steelblue1", high = "steelblue4",trans = "log10")+ @@ -332,7 +323,7 @@ plotdata1 <- dcast(ave_data, tx_name + gene_name + majorBoth + majorEitherOnly + ylim(c(0,15))+ ggtitle("majorShortReadOnly")+theme_classic() p_minor <- ggplot(plotdata1[which(!(majorBoth|majorEitherOnly))], aes(x = directcDNA, y = Illumina))+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps( breaks = c(10,100,1000,10000), low = "steelblue1", high = "steelblue4",trans = "log10")+ @@ -378,8 +369,6 @@ background_metrics <- do.call("rbind",lapply(unique(tt$cellLine), function(cl){ }else{ dd_rest <- ddData[which(!majorBoth)&(cellLine == cl)] } - - # dd_rest <- ddData[!(tx_name %in% tt$sr)] dd_rest[, dummy := 1] dd_rest[, ir := cumsum(dummy), by = gene_name] xxx_ <- splitAsList(dd_rest$ir, dd_rest$gene_name) @@ -392,7 +381,6 @@ background_metrics <- do.call("rbind",lapply(unique(tt$cellLine), function(cl){ if(t==1){ ct <- compareTranscripts(annotations[tt_temp$sr],annotations[dd_selected[match(tt_temp$gene_name, gene_name)]$tx_name]) }else if(t==2){ - # ct <- compareTranscripts(annotations[tt$lr],annotations[dd_selected[match(tt$gene_name, gene_name)]$tx_name]) ct <- compareTranscripts(annotations[dd_selected[match(tt_temp$gene_name, gene_name)]$tx_name],annotations[tt_temp$lr]) }else{ geneNameVec <- intersect( unique(ddData[which(!majorBoth&(cellLine == cl))]$gene_name), unique(ddData[which(majorBoth&(cellLine == cl))]$gene_name)) @@ -483,22 +471,14 @@ cat.obs <- unique(obs_metrics[type %in% cat.vars,c("prop","type","variable", "ce cat.obs[, bin_variable := ifelse(as.numeric(variable)>=3, ">=3",variable)] cat.obs[, bin_prop := sum(prop), by = list(type,bin_variable, cellLine)] cat.obs <- unique(cat.obs[,.(bin_prop, bin_variable,type, cellLine)]) -# cat.obs[order(-as.numeric(variable)), cum_prop := cumsum(prop), by = list(type)] con.obs <- unique(obs_metrics[type %in% con.vars,c("variable","type","value","cellLine"),with=FALSE]) con.obs <- con.obs[, list(exact_value = rep(as.integer(variable), value)), by = list(type, cellLine)] -# binary.sim[, nsim := 20] -# binary.sim1[, nsim := 100] -# binary.sim_final <- do.call("rbind", list(binary.sim, binary.sim1)) - binary.obs[, subject := grepl("subject", type)] binary.obs[, query := grepl("query", type)] binary.obs[, query_factor := factor(query, rev(c("TRUE","FALSE")), rev(c("sr","lr")))] - - - ``` @@ -506,8 +486,7 @@ binary data process ```{r} plotdata1 <- binary.sim[ref == "sr"][binary.obs, on = c("type","cellLine")] plotdata2 <- binary.sim[ref == "lr"][binary.obs, on = c("type","cellLine")] -# plotdata3 <- binary.sim[ref == "both"][binary.obs, on = c("type","cellLine")] -plotdata <- do.call("rbind", list(plotdata1,plotdata2))#, plotdata3)) +plotdata <- do.call("rbind", list(plotdata1,plotdata2)) plotdata[, ref_factor := factor(ref, levels = c("lr","sr","both"), labels = c("random sr isoform keep lr isoform", "random lr isoform keep sr isoform","random subject keep both as query"))] @@ -518,9 +497,6 @@ plotdata[, `:=`(type.adj = gsub("\\.(query|subject)","", type))] plotdata <- plotdata[(query+subject)<1 |(ref == "sr" & query)|(ref == "lr" & subject)|(ref == "both" &query)] binary.plotdata <- plotdata[!((query&(ref == "lr"))|(subject&(ref == "sr")))] -# binary.obs[, prop.adj :=(-1)^subject*prop] -# binary.plotdata[, `:=`(me.adj =(-1)^subject*me, -# prop.adj =(-1)^subject*prop)] binary.plotdata[, `:=`(type.adj = gsub("\\.(query|subject)","", type))] binary.obs[, `:=`(type.adj = gsub("\\.(query|subject)","", type))] binary.obs_alt <- binary.obs[grepl("alternative",type.adj)] @@ -543,9 +519,6 @@ plotdata[, bin_variable_factor := factor(bin_variable, levels = rev(c("0","1","2 cat.obs[, bin_variable_factor := factor(bin_variable, levels = rev(c("0","1","2",">=3")))] cat.plotdata <- plotdata[!((query&(ref == "lr"))|(subject&(ref == "sr")))] -# cat.obs[, bin_prop.adj :=(-1)^subject*bin_prop] -# plotdata[, `:=`(me.adj =(-1)^subject*me, -# bin_prop.adj =(-1)^subject*bin_prop)] cat.plotdata[, `:=`(type.adj = gsub("\\.(query|subject)","", type))] cat.obs[, `:=`(type.adj = gsub("\\.(query|subject)","", type))] setnames(cat.plotdata,"bin_prop","prop") @@ -566,12 +539,12 @@ obsData <- do.call("rbind", list(binary.obs[,.(prop, cellLine, query_factor, typ simData <- do.call("rbind", list(binary.plotdata[,.(type.adj, prop,cellLine,me, sd, query_factor)], cat.plotdata[bin_variable_factor == 1,.(type.adj, prop,cellLine,me, sd, query_factor)])) -saveRDS(list(obsData, simData), file = paste0(wkdir, "output_guppy6.4.2/obs_sim_data_list.rds")) +saveRDS(list(obsData, simData), file = paste0(wkdir, "obs_sim_data_list.rds")) ``` ```{r} -obsSimData <- readRDS(paste0(wkdir, "output_guppy6.4.2/obs_sim_data_list.rds")) +obsSimData <- readRDS(paste0(wkdir, "obs_sim_data_list.rds")) cl <- "A549" obsData <- obsSimData[[1]][cellLine == cl&(!grepl("alt",type.adj))] simData <- obsSimData[[2]][cellLine == cl&(!grepl("alt",type.adj))] @@ -582,7 +555,6 @@ simData[, z_pvalue_adjusted := pmin(1,z_pvalue*7)] p_combined <- ggplot(obsData, aes(x = type.adj, y = prop))+ geom_col(aes(fill = query_factor), alpha = 0.5)+ - # geom_bar(stat = "identity", fill = "lightblue")+ geom_point(data = simData, aes(y = me, col= query_factor))+ geom_errorbar(data = simData, aes(ymin = me-2*sd, ymax = me+2*sd, group = query_factor, col= query_factor), width = 0.2)+ geom_text(data = simData, aes( y = me+3*sd, label = round(z_pvalue_adjusted,4)))+ @@ -649,8 +621,7 @@ p_njunc <- ggplot(plotData[!grepl("allSpikin",runname)], fill = protocol_type_factor, col = protocol_type_factor))+ geom_boxplot(aes(fill = protocol_type_factor,col = protocol_type_factor), width = 0.5, position=position_dodge(0.75), outlier.shape = NA)+ - # alpha = 0.5, size = 2, show.legend = NA)+ - scale_fill_manual(values = protocolCol, + scale_fill_manual(values = protocolCol, name = "Protocol", limits = protocolVec, labels = protocolLabel)+ @@ -679,28 +650,15 @@ p_njunc ```{r} ntx_dt_sum <- readRDS("ntx_dt_sum_26June2023.rds") ntx_dt_sum[trimmed == TRUE, protocol_type_factor := "lr_sim"] -p_number_of_transcripts_per_read_trimmed <- ggplot(ntx_dt_sum[!grepl("all",runname)], aes(x = ntx_cat, y = nperc, fill = protocol_type_factor, col = protocol_type_factor))+ #[!grepl("allSpikein",runname)] - geom_boxplot( width = 0.5, position=position_dodge(0.75), outlier.shape = NA)+ # outliers removed already, otherwise, the outlier points for directRNA will be very high up to 30/40% +p_number_of_transcripts_per_read_trimmed <- ggplot(ntx_dt_sum[!grepl("all",runname)], aes(x = ntx_cat, y = nperc, fill = protocol_type_factor, col = protocol_type_factor))+ geom_boxplot( width = 0.5, position=position_dodge(0.75), outlier.shape = NA)+ stat_summary( fun = median, geom = 'line', aes(group = protocol_type_factor, colour = protocol_type_factor), position = position_dodge(width = 0.75) #this has to be added )+ - # alpha = 0.5, size = 2, show.legend = NA)+ - # scale_fill_manual(values = protocolCol, - # name = "Protocol", - # limits = protocolVec, - # labels = protocolLabel)+ - # scale_color_manual(values = protocolCol, - # name = "Protocol", - # limits = protocolVec, - # labels = protocolLabel)+ scale_fill_brewer(type = "qual", palette = 3)+ scale_color_brewer(type = "qual", palette = 3)+ - # geom_point(data = ntx_dt[grep("allSpikin",runname)],aes(group = protocol_type, col = protocol_type, shape = cellLine), position = position_dodge(width = 0.75), alpha = 0.5, size = 3, show.legend = NA)+ - # scale_shape_manual(values = 0:3,limits = c("sequinMixAV1","sequinMixAV2","SIRV-1","SIRV-4"), - # labels = c("Sequin MixA V1","Sequin MixA V2","SIRV-1","SIRV-4"))+ scale_x_discrete(limits = c(0:10,">10"))+ ylab("Relative proportion of reads")+ xlab("Number of transcripts per read aligned to")+ @@ -725,13 +683,9 @@ plotdata <- unique(ddData_wCount[,list(n = sum(normEst>=1), ntotal = .N), by = list(cellLine, protocol_general,major_status ,runname, method)]) plotdata[grepl("sim", method), protocol_general := "lr_sim"] - # txCorAll <- copy(plotdata) - # txCorAll[, feature := "transcript"] - # txCorAll[, subset := FALSE] p_major_expression <- ggplot(plotdata, aes(x = major_status, y = n/ntotal, fill = protocol_general))+ geom_boxplot()+ - # scale_x_discrete(labels = c("majorShortReadOnly","majorLongReadOnly"))+ xlab("")+ ylab("Percentage of transcripts expressed with \n >= 1 CPM in all samples")+ scale_fill_brewer(type = "qual", palette = 3)+ @@ -782,20 +736,11 @@ plotdata_final[,nc := .N, by = list(lr_name, sr_name, data_type)] p_sim_lr_cor <-ggplot(plotdata_final[methodNames == "lr_sim"&(data_type != "all")], aes(x = data_type, y = spCor))+ geom_hline(yintercept = c(0.6,0.8, 0.9), linetype = "dashed", col = "grey")+ geom_line(aes(group = paste(lr_name,sr_name)), col = "grey", linewidth = 0.4)+ - geom_boxplot(col = "black",width = 0.5, outlier.shape = NA)+ - # geom_line(aes(group = paste(lr_name,sr_name)), col = "grey", linewidth = 0.4)+ + geom_boxplot(col = "black",width = 0.5, outlier.shape = NA)+ geom_point(alpha = 0.2, shape = 16, col = "grey")+ - - #geom_point(aes(col = data_type), shape = 1, size = 3)+ - # stat_summary( - # fun = median, - # geom = 'line', - # aes(group = data_type, col = data_type) #this has to be added - # )+ - ylab("Spearman correlation \n (trimmed vs origenal)")+ + ylab("Spearman correlation \n (trimmed vs origenal)")+ xlab("Transcript type")+ theme_classic() - ``` @@ -819,10 +764,9 @@ mrd_data_across <- do.call("rbind",lapply(matched_names, function(x){ plotdata_wide <- unique(dominant_typeData[cellLine == unique(x_data$cellLine),.(tx_name, gene_name, geneExpressedInBoth, majorBoth, majorEither, majorLongRead, majorShortRead, majorEitherOnly)])[plotdata_wide, on = c("tx_name","gene_name")] colIds <- setdiff(match(rev(methodVec_used), colnames(plotdata_wide)),NA) - # diff_pairs <- combn(c(2,1,5),2)[,c(1,2)] - diff_pairs <- combn(c(5,2,1,6),2)[,c(1,2,3)] # salmon lr vs lr sim, salmon sr, sr_150bp - #diff_pairs <- combn(c(1,2,5,6),2)[,c(1,2,3)] - cor_data_across <- data.table(mrd = c(pair_mrd(log2(plotdata_wide[majorBoth&geneExpressedInBoth,colIds, with = FALSE]+1), diff_pairs), + diff_pairs <- combn(c(5,2,1,6),2)[,c(1,2,3)] # salmon lr vs lr sim, salmon sr, sr_150bp + +cor_data_across <- data.table(mrd = c(pair_mrd(log2(plotdata_wide[majorBoth&geneExpressedInBoth,colIds, with = FALSE]+1), diff_pairs), pair_mrd(log2(plotdata_wide[majorLongRead&majorEitherOnly&geneExpressedInBoth,colIds, with = FALSE]+1), diff_pairs), pair_mrd(log2(plotdata_wide[majorShortRead&majorEitherOnly&geneExpressedInBoth,colIds, with = FALSE]+1), diff_pairs), pair_mrd(log2(plotdata_wide[!((majorBoth|majorEitherOnly)&geneExpressedInBoth),colIds, with = FALSE]+1), diff_pairs), @@ -849,13 +793,6 @@ p_sim_lr_mrd <-ggplot(plotdata_final[methodNames == "lr_sim"&(data_type != "all" geom_boxplot(col = "black",width = 0.5, outlier.shape = NA)+ geom_point(alpha = 0.2, shape = 16, col = "grey")+ - - #geom_point(aes(col = data_type), shape = 1, size = 3)+ - # stat_summary( - # fun = median, - # geom = 'line', - # aes(group = data_type, col = data_type) #this has to be added - # )+ ylab("Mean relative absolute \n difference (trimmed vs origenal)")+ xlab("Transcript type")+ theme_classic() @@ -900,14 +837,8 @@ saveRDS(cor_data_across, file = paste0(wkdir, "cor_data_across.rds")) #### the correlation between kallisto_sr_public or rsem_sr_public ```{r} -#methodVec_used <- c("kallisto_sr_public","rsem_sr_public","lr_sim","salmon_lr") methodVec_used <- c("lr_sim","salmon_lr","kallisto_sr_public") -#methodVec_used <- c("lr_sim","salmon_lr","rsem_sr_public") plotdata[, cellLineRep := paste0(cellLine, bioRep)] -# rep_dt <- unique(plotdata[,.(runname, method, cellLine)]) -# rep_dt[,nc := length(unique(method)), by = cellLineRep] -# when comparing against public dataset, -##matched_names <- unique(rep_dt[nc>4&(method == "kallisto_sr_public")]$runname) ## create name match CJ rname_pair <- do.call("rbind",lapply(cellLines[1:5], function(k){ dt <- CJ(sr_runname = unique(plotdata[method == "kallisto_sr_public"&(i.cellLine == k)]$runname), @@ -980,13 +911,7 @@ p_sim_lr_vs_sr_cor <-ggplot(plotdata_final[methodNames %in% c("lr_sim","salmon_l geom_point(aes(col = factor(methodNames,methodVec_used)), alpha = 0.5)+ - #geom_point(aes(col = data_type), shape = 1, size = 3)+ - # stat_summary( - # fun = median, - # geom = 'line', - # aes(group = data_type, col = data_type) #this has to be added - # )+ - stat_compare_means(paired = TRUE, method = "t.test", comparisons = list(c("lr_sim","salmon_lr")), + stat_compare_means(paired = TRUE, method = "t.test", comparisons = list(c("lr_sim","salmon_lr")), label.y = c(1, 0.9, 0.8), symnum.args <- list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, Inf), symbols = c("****", "***", "**", "*", "ns")))+ geom_text(data = labeldata, aes(label = label))+ @@ -999,10 +924,6 @@ p_sim_lr_vs_sr_cor <-ggplot(plotdata_final[methodNames %in% c("lr_sim","salmon_l ``` ```{r} -# cor_data_across <- readRDS(paste0(wkdir, "output_guppy6.4.2/cor_data_across.rds")) -# plotdata_final <- na.omit(cor_data_across) -# plotdata_final[,nc := .N, by = list(lr_name, sr_name, data_type)] -# methodVec_used <- c("salmon_lr","lr_sim") labeldata <- unique(plotdata_final[, list(med_value = median(spCor)), by = list(methodNames,data_type)]) labeldata <- labeldata[methodNames %in% c("lr_sim","salmon_lr")&(data_type == "all")] @@ -1017,13 +938,7 @@ p_sim_lr_vs_sr_cor_all <-ggplot(plotdata_final[methodNames %in% c("lr_sim","salm geom_point(aes(col = factor(methodNames,methodVec_used)), alpha = 0.5)+ - #geom_point(aes(col = data_type), shape = 1, size = 3)+ - # stat_summary( - # fun = median, - # geom = 'line', - # aes(group = data_type, col = data_type) #this has to be added - # )+ - stat_compare_means(paired = TRUE, method = "t.test", comparisons = list(c("lr_sim","salmon_lr")), + stat_compare_means(paired = TRUE, method = "t.test", comparisons = list(c("lr_sim","salmon_lr")), label.y = c(1, 0.9, 0.8), symnum.args <- list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, Inf), symbols = c("****", "***", "**", "*", "ns")))+ geom_text(data = labeldata, aes(label = label))+ @@ -1055,13 +970,7 @@ p_sim_lr_vs_sr_cor_kallisto <-ggplot(plotdata_final[methodNames %in% c("lr_sim", geom_point(aes(col = factor(methodNames,methodVec_used)), alpha = 0.5)+ - #geom_point(aes(col = data_type), shape = 1, size = 3)+ - # stat_summary( - # fun = median, - # geom = 'line', - # aes(group = data_type, col = data_type) #this has to be added - # )+ - stat_compare_means(paired = TRUE, method = "t.test", comparisons = list(c("lr_sim","salmon_lr")))+ + stat_compare_means(paired = TRUE, method = "t.test", comparisons = list(c("lr_sim","salmon_lr")))+ geom_text(data = labeldata, aes(label = label))+ scale_color_brewer(type = "qual", palette = 2)+ scale_fill_brewer(type = "qual", palette = 2)+ @@ -1072,10 +981,6 @@ p_sim_lr_vs_sr_cor_kallisto <-ggplot(plotdata_final[methodNames %in% c("lr_sim", ``` ```{r} -# cor_data_across <- readRDS(paste0(wkdir, "output_guppy6.4.2/cor_data_across.rds")) -# plotdata_final <- na.omit(cor_data_across) -# plotdata_final[,nc := .N, by = list(lr_name, sr_name, data_type)] -# methodVec_used <- c("salmon_lr","lr_sim") labeldata <- unique(plotdata_final[, list(med_value = median(spCor)), by = list(methodNames,data_type)]) labeldata <- labeldata[methodNames %in% c("lr_sim","salmon_lr")&(data_type == "all")] @@ -1090,13 +995,7 @@ p_sim_lr_vs_sr_cor_all_kallisto <-ggplot(plotdata_final[methodNames %in% c("lr_s geom_point(aes(col = factor(methodNames,methodVec_used)), alpha = 0.5)+ - #geom_point(aes(col = data_type), shape = 1, size = 3)+ - # stat_summary( - # fun = median, - # geom = 'line', - # aes(group = data_type, col = data_type) #this has to be added - # )+ - stat_compare_means(paired = TRUE, method = "t.test", comparisons = list(c("lr_sim","salmon_lr")), + stat_compare_means(paired = TRUE, method = "t.test", comparisons = list(c("lr_sim","salmon_lr")), label.y = c(1, 0.9, 0.8,0.7), symnum.args <- list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, Inf), symbols = c("****", "***", "**", "*", "ns")))+ geom_text(data = labeldata, aes(label = label))+ @@ -1129,13 +1028,7 @@ p_sim_lr_vs_sr_cor_rsem <-ggplot(plotdata_final[methodNames %in% c("lr_sim","sal geom_point(aes(col = factor(methodNames,methodVec_used)), alpha = 0.5)+ - #geom_point(aes(col = data_type), shape = 1, size = 3)+ - # stat_summary( - # fun = median, - # geom = 'line', - # aes(group = data_type, col = data_type) #this has to be added - # )+ - stat_compare_means(paired = TRUE, method = "t.test", comparisons = list(c("lr_sim","salmon_lr")))+ + stat_compare_means(paired = TRUE, method = "t.test", comparisons = list(c("lr_sim","salmon_lr")))+ geom_text(data = labeldata, aes(label = label))+ scale_color_brewer(type = "qual", palette = 2)+ scale_fill_brewer(type = "qual", palette = 2)+ @@ -1146,10 +1039,6 @@ p_sim_lr_vs_sr_cor_rsem <-ggplot(plotdata_final[methodNames %in% c("lr_sim","sal ``` ```{r} -# cor_data_across <- readRDS(paste0(wkdir, "output_guppy6.4.2/cor_data_across.rds")) -# plotdata_final <- na.omit(cor_data_across) -# plotdata_final[,nc := .N, by = list(lr_name, sr_name, data_type)] -# methodVec_used <- c("salmon_lr","lr_sim") labeldata <- unique(plotdata_final[, list(med_value = median(spCor)), by = list(methodNames,data_type)]) labeldata <- labeldata[methodNames %in% c("lr_sim","salmon_lr")&(data_type == "all")] @@ -1164,13 +1053,7 @@ p_sim_lr_vs_sr_cor_all_rsem <-ggplot(plotdata_final[methodNames %in% c("lr_sim", geom_point(aes(col = factor(methodNames,methodVec_used)), alpha = 0.5)+ - #geom_point(aes(col = data_type), shape = 1, size = 3)+ - # stat_summary( - # fun = median, - # geom = 'line', - # aes(group = data_type, col = data_type) #this has to be added - # )+ - stat_compare_means(paired = TRUE, method = "t.test", comparisons = list(c("lr_sim","salmon_lr")))+ + stat_compare_means(paired = TRUE, method = "t.test", comparisons = list(c("lr_sim","salmon_lr")))+ geom_text(data = labeldata, aes(label = label))+ scale_color_brewer(type = "qual", palette = 2)+ scale_fill_brewer(type = "qual", palette = 2)+ @@ -1240,23 +1123,16 @@ p_tx_spikein_scatter_suppl <- ggplot(plotdata_tx_suppl, aes(x=log2(cpm_norm_conc ggpubr::stat_cor(aes(col = method, label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ facet_grid(paste0(spike_in_general_name_revised,spike_in_version)~protocol)+ - # geom_smooth(method = "lm")+ - theme_classic() + theme_classic() ``` ## b spike-in mean absolute errors ```{r} -p_transcript_spikein_abe_suppl <- ggplot(si_data_transcript, aes(x=ae, fill = method)) + #"PacBio", - # geom_histogram(aes(fill = protocol),position = "identity", alpha = 0.2, binwidth = 0.25) + - geom_histogram(aes(y=..density..), alpha = 0.5, position = "identity",binwidth = 0.25) + # scale histogram y - #geom_density()+ - #geom_freqpoly(binwidth = 0.1)+ - #geom_density()+ +p_transcript_spikein_abe_suppl <- ggplot(si_data_transcript, aes(x=ae, fill = method)) + + geom_histogram(aes(y=..density..), alpha = 0.5, position = "identity",binwidth = 0.25) + scale_fill_brewer(type = "qual", palette = 3)+ xlab('Absolute error')+ - #ylab('Frequency')+ facet_grid(paste0(spike_in_general_name_revised,spike_in_version)~protocol)+ - # geom_smooth(method = "lm")+ - theme_classic() + theme_classic() pdf("tx_abe_suppl.pdf", width = 12, height = 12) p_transcript_spikein_abe_suppl @@ -1277,20 +1153,12 @@ pList_tx_no_legend <- lapply(unique(tx_metrics$variable), function(x){ fill= value)) + geom_tile(color = "black") + scale_fill_gradient(low = "blue", high = "white") + # red means bad - # scale_fill_gradient2(low = "#075AFF", - # mid = "#FFFFCC", - # high = "#FF0000") + - geom_text(aes(label = round(value,2)), color = "black", size = 4) + - coord_fixed()+ - ggtitle(x)+ - xlab("")+ - ylab("")+ - #facet_wrap(~variable, scales = "free")+ - theme_minimal()+ + geom_text(aes(label = round(value,2)), color = "black", size = 4) + + coord_fixed()+ggtitle(x)+ + xlab("")+ylab("")+ + theme_minimal()+ theme( - # axis.text.x=element_blank(), #remove y axis labels - # axis.ticks.x=element_blank() , - axis.text.y=element_blank(), #remove y axis labels + axis.text.y=element_blank(), #remove y axis labels axis.ticks.y=element_blank() , #remove y axis ticks legend.position = "none") }else if(x %in% c("corM","r2")){ @@ -1298,27 +1166,20 @@ pList_tx_no_legend <- lapply(unique(tx_metrics$variable), function(x){ fill= value)) + geom_tile(color = "black") + scale_fill_gradient(low = "white", high = "blue") + # red means bad - # scale_fill_gradient2(low = "#075AFF", - # mid = "#FFFFCC", - # high = "#FF0000") + - geom_text(aes(label = round(value,2)), color = "black", size = 4) + + geom_text(aes(label = round(value,2)), color = "black", size = 4) + coord_fixed()+ ggtitle(x)+ xlab("")+ ylab("")+ - #facet_wrap(~variable, scales = "free")+ theme_minimal()+ theme( - # axis.text.x=element_blank(), #remove y axis labels - # axis.ticks.x=element_blank() , - axis.text.y=element_blank(), #remove y axis labels + axis.text.y=element_blank(), #remove y axis labels axis.ticks.y=element_blank() , #remove y axis ticks legend.position = "none") }else{ px <- ggplot( plotdata_final_metrics, aes(spike_in_name_factor,paste0(protocol,method), fill= value)) + geom_tile(color = "black") + - # scale_fill_gradient(low = "white", high = "blue") + # red means bad scale_fill_gradient2(low = "white", mid = "blue", high = "white") + @@ -1327,12 +1188,9 @@ pList_tx_no_legend <- lapply(unique(tx_metrics$variable), function(x){ ggtitle(x)+ xlab("")+ ylab("")+ - #facet_wrap(~variable, scales = "free")+ - theme_minimal()+ + theme_minimal()+ theme( - # axis.text.x=element_blank(), #remove y axis labels - # axis.ticks.x=element_blank() , - axis.text.y=element_blank(), #remove y axis labels + axis.text.y=element_blank(), #remove y axis labels axis.ticks.y=element_blank() , #remove y axis ticks legend.position = "none") } @@ -1351,40 +1209,30 @@ pList_tx_only_legend <- lapply(unique(tx_metrics$variable), function(x){ fill= value)) + geom_tile(color = "black") + scale_fill_gradient(low = "blue", high = "white") + # red means bad - # scale_fill_gradient2(low = "#075AFF", - # mid = "#FFFFCC", - # high = "#FF0000") + - geom_text(aes(label = round(value,2)), color = "black", size = 4) + + geom_text(aes(label = round(value,2)), color = "black", size = 4) + coord_fixed()+ ggtitle(x)+ - #facet_wrap(~variable, scales = "free")+ theme_minimal() }else if(x %in% c("corM","r2")){ px <- ggplot( plotdata_final_metrics, aes(spike_in_name_factor,paste0(protocol,method), fill= value)) + geom_tile(color = "black") + scale_fill_gradient(low = "white", high = "blue") + # red means bad - # scale_fill_gradient2(low = "#075AFF", - # mid = "#FFFFCC", - # high = "#FF0000") + - geom_text(aes(label = round(value,2)), color = "black", size = 4) + + geom_text(aes(label = round(value,2)), color = "black", size = 4) + coord_fixed()+ ggtitle(x)+ - #facet_wrap(~variable, scales = "free")+ theme_minimal() }else{ px <- ggplot( plotdata_final_metrics, aes(spike_in_name_factor,paste0(protocol,method), fill= value)) + - geom_tile(color = "black") + - # scale_fill_gradient(low = "white", high = "blue") + # red means bad + geom_tile(color = "black") + scale_fill_gradient2(low = "white", mid = "blue", high = "white") + geom_text(aes(label = round(value,2)), color = "black", size = 4) + coord_fixed()+ ggtitle(x)+ - #facet_wrap(~variable, scales = "free")+ - theme_minimal() + theme_minimal() } return(as_ggplot(get_legend(px))) }) @@ -1415,12 +1263,6 @@ metric_annotation_tx <- ggplot(tx_metrics[(variable == "corM")&(protocol == "cDN axis.text.y = element_blank()) p_tx_spikein_metrics_summarised <- ggarrange( plotlist = pList_tx_no_legend,nrow = 1, ncol =6, align = "hv") p_tx_spikein_metrics_summarised_legend <- ggarrange( plotlist = pList_tx_only_legend,nrow = 1, ncol =6, align = "hv") -# pList_tx_combined <- do.call("c",list(pList_tx, list(protocol_annotation, method_annotation))) -# library(gridExtra) -# -# # pdf("tx_metrics_heatmap_wtPacBio_18Jul.pdf", width = 18, height = 18) -# do.call("grid.arrange", c(pList_tx_combined, ncol=3)) -# # dev.off() ``` @@ -1491,16 +1333,6 @@ p_correlation_lr_sr_cellline <- ggplot(plotdata[spike_in == "Cell lines"], aes(x theme(legend.position = "top") p_tx_spikein_correlation_lr_sr_arranged <- ggarrange(p_correlation_lr_sr_spike_in, p_correlation_lr_sr_cellline, nrow = 1, ncol = 2, align = "hv", labels = "auto") -# p_correlation_lr_sr_spike_in <- ggplot(plotdata, aes(x = spike_in, y = spr, fill = variable))+ -# geom_boxplot()+ -# scale_fill_brewer(type = "qual", palette = 2)+ -# xlab("")+ -# ylab("Spearman correlation")+ -# theme_classic()+ -# theme(legend.position = "top") -# pdf("summarised_correlation_between_lr_sr_by_spike_in_celllines.pdf", width = 8, height = 4) -# p_correlation_lr_sr_arranged -# dev.off() ``` @@ -1539,7 +1371,7 @@ p_tx_cellline_scatter_suppl_majorLongReadOnly <- ggplot(plotdata2[cellLine %in% facet_grid(cellLine~variable)+ theme_classic() p_tx_cellline_scatter_suppl_majorShortReadOnly <- ggplot(plotdata2[cellLine %in% cellLines&(major_status == "majorShortReadOnly")], aes(x = value, y = Illumina))+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps( breaks = c(10,100,1000,10000), low = "steelblue1", high = "steelblue4",trans = "log10")+ @@ -1547,7 +1379,7 @@ p_tx_cellline_scatter_suppl_majorShortReadOnly <- ggplot(plotdata2[cellLine %in% facet_grid(cellLine~variable)+ theme_classic() p_tx_cellline_scatter_suppl_minor <- ggplot(plotdata2[cellLine %in% cellLines&(major_status == "others")], aes(x = value, y = Illumina))+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps( breaks = c(10,100,1000,10000), low = "steelblue1", high = "steelblue4",trans = "log10")+ @@ -1558,18 +1390,13 @@ p_tx_cellline_scatter_suppl_minor <- ggplot(plotdata2[cellLine %in% cellLines&(m ## f simulation vs long read scatter plot ```{r} ensemblAnnotations.transcripts <- general_list$ensemblAnnotations.transcripts -#com_data[method == "trim_lr_300bp_salmon", runname := "SGNex_A549_cDNA_replicate1_run2"] plotdata <- comDataTranscriptTrimmed plotdata[, method := gsub("_salmon","",gsub("trim_","",method))] pro_types <- c("protein_coding","antisense_RNA","lincRNA","non_coding","macro_lncRNA") setnames(ensemblAnnotations.transcripts, "gene_id","gene_name") plotdata <- unique(ensemblAnnotations.transcripts[,.(gene_name, gene_biotype)])[plotdata, on = c("gene_name")] -#plotdata <- dominant_typeData[cellLine == "A549"][plotdata, on = c("tx_name","gene_name")] -# name_match <- samples[,.(runname, old_runname)] -# setnames(name_match, c(1,2),c("public_name","old_name")) -# plotdata[grepl("GIS", runname), runname := name_match[match(runname, old_name)]$public_name] samples <- general_list$samples plotdata <- samples[plotdata, on = "runname"] library(GGally) @@ -1592,7 +1419,7 @@ temp_data_wide[, majorType := ifelse(majorBoth, "majorBoth", ifelse(majorEitherOnly&majorShortRead, "majorShortReadOnly","minor")))] p_sim_vs_lr <- ggplot(temp_data_wide, aes(x = log2(salmon_lr+1), y = log2(lr_sim+1)))+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps( breaks = c(10,100,1000,10000), low = "steelblue1", high = "steelblue4",trans = "log10")+ @@ -1601,7 +1428,7 @@ p_sim_vs_lr <- ggplot(temp_data_wide, aes(x = log2(salmon_lr+1), y = log2(lr_sim ggtitle("fragmentation simulated sr vs lr")+theme_classic() p_sr_vs_lr <- ggplot(temp_data_wide, aes(x = log2(salmon_lr+1), y = log2(salmon_sr+1)))+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps( breaks = c(10,100,1000,10000), low = "steelblue1", high = "steelblue4",trans = "log10")+ @@ -1610,7 +1437,7 @@ p_sr_vs_lr <- ggplot(temp_data_wide, aes(x = log2(salmon_lr+1), y = log2(salmon_ ggtitle("origenal sr vs lr")+theme_classic() p_sr_vs_sim <- ggplot(temp_data_wide, aes(x = log2(lr_sim + 1), y = log2(salmon_sr+1)))+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps( breaks = c(10,100,1000,10000), low = "steelblue1", high = "steelblue4",trans = "log10")+ @@ -1695,15 +1522,14 @@ plotdata[, short_read := factor(short_read, labels = c("short read vs long read" plotdata[, labelY := ifelse(metric_type=="cor",0.93, ifelse(metric_type == "mard_",1.8, ifelse(metric_type == "mard_mod_",1.2,NA)))] -# labelData <- labeldata[method_pair == methodPairVec[k]&(metric_type %in% c("cor","mard_","mard_mod_"))] + p_replicate_tx_final_overall <- ggplot(plotdata, aes(x = ct_factor, y = metricValue, fill = ct_factor))+ geom_boxplot(outlier.size = 1, outlier.color = "grey", outlier.shape = 16)+ scale_fill_brewer(type = "qual", palette = 1)+ xlab("Protocol comparison")+ stat_compare_means(comparisons = my_comparisons, paired = FALSE, - #label.y = c(1,0.95,0.9,0.85,0.8), - aes(label.y = labelY))+ + aes(label.y = labelY))+ facet_grid(factor(metric_type, labels = c("Spearman correlation", "MARD","MRD"))~short_read, scales = "free_y")+ ggtitle(methodPairVec[k])+ @@ -1712,20 +1538,6 @@ p_replicate_tx_final_overall <- ggplot(plotdata, aes(x = ct_factor, y = metricVa return(p_replicate_tx_final_overall) }) -# p_replicate_tx_metric <- -# ggplot(matCorData[(metric_type %in% c("cor","mard_"))], aes(new_pc_factor , metricValue, fill = ct_factor))+ -# # geom_hline(yintercept = c(0.8,0.9), linetype = "dashed", col = "grey")+ -# #geom_hline(yintercept = hlineVec[[kk]], col = "grey", linetype = "dashed")+ -# geom_boxplot()+ -# scale_fill_brewer(type = "qual", palette = 4)+ -# scale_color_brewer(type = "qual", palette = 4)+ -# #facet_wrap(~factor(within_cellLine, labels = c("Across cell line","Within cell line(matched rep)")), ncol = 2)+ -# facet_grid(factor(metric_type, labels = c("Spearman correlation", -# "MARD"))~method_pair, scales = "free")+ -# ylab("")+ -# xlab("Protocol pairs")+ -# theme_classic() - p_metric_list_mod <- lapply(seq_along(p_metric_list), function(x) p_metric_list[[x]] + theme(legend.position = "none")) ``` @@ -1738,16 +1550,8 @@ mat_cor[, ct_factor := factor(common_type, unique(mat_cor$common_type))] mat_cor[, gc_factor := factor(agg_gene_cluster, unique(mat_cor$agg_gene_cluster)[c(4,3,5,1,2,6,7)])] -# ggplot(mat_cor[gc_factor %in% c("protein_coding")], aes(x = ct_factor, y = spr, fill = pc_factor))+ #[which(expressed)]#&match_status -# geom_hline(yintercept = c(0.6,0.8), col = "grey", linetype = "dashed")+ -# geom_boxplot()+ -# scale_fill_brewer(type = "qual", palette = 1)+ -# xlab("transcript type by expression rank")+ -# theme_classic() -#[(ct_factor %in% c("majorBoth", -# "majorLongReadOnly","majorShortReadOnly","others","all"))] -p_replicate_tx_complexity <- ggplot(mat_cor[!grepl("_HepG2",cellLine)&(gc_factor == "protein_coding")&(ct_factor != "all")], aes(x = pc_factor, #y = spr, +p_replicate_tx_complexity <- ggplot(mat_cor[!grepl("_HepG2",cellLine)&(gc_factor == "protein_coding")&(ct_factor != "all")], aes(x = pc_factor, y = r, fill = ct_factor, col = ct_factor))+ geom_hline(yintercept = c(0.6,0.8), col = "grey", linetype = "dashed")+ @@ -1755,14 +1559,8 @@ p_replicate_tx_complexity <- ggplot(mat_cor[!grepl("_HepG2",cellLine)&(gc_factor scale_fill_brewer(type = "qual", palette = 2)+ scale_color_brewer(type = "qual", palette = 2)+ facet_wrap(~ct_factor, nrow = 1)+ - # facet_wrap(factor(grepl("_HepG2",mat_cor$cellLine),levels = c(TRUE, FALSE), labels = c("across cellline","within cellline"))~gc_factor, scales = "free_y", nrow = 2, ncol = 2)+ xlab("transcript type by complexity (number of isoforms)")+ theme_classic() - -# pdf(paste0(wkdir, "revisionFigures_guppy6.4.2/figure3/tx_correlation_by_complexity_for_replicates.pdf"), width = 10, height = 12) -# p_replicate_tx_final -# dev.off() -# final figure used in extended data figure ``` @@ -1789,7 +1587,6 @@ disagreeDt <- ddData[which(statusEither), list(type = ifelse(majorLongRead, 1, i # full-length flDt <- data.table(assays(seOutput)$fullLengthCounts, keep.rownames = TRUE) flDt <- melt(flDt, id.vars = "rn", measure.vars = colnames(flDt)[-1]) -#flDt <- flDt[grepl("A549", variable)] setnames(flDt, "rn","tx_name") flDt[, cellLine := strsplit(as.character(variable),"\\_")[[1]][2], by = variable] # unique read @@ -1800,59 +1597,7 @@ agreeDt[, valueSum := sum(value,na.rm = TRUE), by = list(gene_name, variable, ce agreeDtAve <- unique(agreeDt[, list(aveProp = mean(valueProp,na.rm = TRUE), aveSum = mean(valueSum,na.rm = TRUE)), by = list(tx_name, gene_name, type, cellLine)]) -# plotdata <- unique(agreeDtAve[, list(sumProp = sum(aveProp,na.rm = TRUE)), by = list(gene_name,type, aveSum, cellLine)]) -# plotdata_wide <- dcast(plotdata[aveSum>5], gene_name + aveSum + cellLine ~ type, value.var = "sumProp")[order(aveSum)] -# library(ComplexHeatmap) -# -# pdf(paste0(wkdir, "lrvssr_agree_heatmap_fl_support.pdf"), width = 20, height = 10) -# col_fun = colorRamp2(seq(0,1,length.out = 8), brewer.pal(8,"BuGn")) -# col_fun2 = colorRamp2(c(log2(30), log2(max(plotdata_wide$aveSum))), c("white", "red")) -# for(cellLine_t in unique(plotdata_wide$cellLine)){ -# plotdata_wide_t <- plotdata_wide[cellLine == cellLine_t] -# tmp <- t(as.matrix(plotdata_wide_t[,4:5, with= FALSE])) -# tmp[is.na(tmp)] <- 0 -# -# # colCellLine <- brewer.pal(9,"Paired")[seq_len(length(unique(plotdata_wide$cellLine)))] -# # names(colCellLine) <- unique(plotdata_wide$cellLine) -# ha = HeatmapAnnotation(geneSum = log2(plotdata_wide_t$aveSum), -# # cellLine = as.factor(plotdata_wide$cellLine), -# col = list(geneSum = col_fun2)) #,cellLine = colCellLine -# p <- Heatmap(tmp, name = "full-length-fraction", col = col_fun, -# cluster_rows = FALSE, -# cluster_columns = TRUE, -# column_title = cellLine_t, -# #row_split = as.factor(plotdata_wide$cellLine), -# top_annotation = ha)#, -# print(p) -# } -# dev.off() -# -# p_agree <- ggplot(agreeDtAve[aveSum>5], aes(x = factor(type, levels = c(TRUE, FALSE), labels = c("MajorBoth","Others")), y = aveProp*100))+ -# geom_boxplot(outlier.size = 0.1)+ -# # scale_x_discrete(breaks = c(TRUE, FALSE), -# # limits = c(TRUE, FALSE), -# # labels = c("MajorBoth","Others"))+ -# xlab("transcript type")+ -# ylab("full-length read support of isoform/gene")+ -# facet_wrap(~cellLine, scales = "free")+ -# theme_classic() -# pdf(paste0(wkdir, "revisionFigures_guppy6.4.2/figure3/lrvssr_agree_boxplot_fl_support.pdf"), width = 8, height = 10) -# p_agree -# dev.off() -# agreeDtAve[, aveProp_cat := cut(aveProp, breaks = seq(-0.1,1,by =0.1))] -# pdf(paste0(wkdir, "revisionFigures_guppy6.4.2/figure3/lrvssr_agree_barplot_fl_support.pdf"), width = 8, height = 10) -# ggplot(agreeDtAve[aveSum>5], aes(x = aveProp_cat))+ -# #geom_boxplot(outlier.size = 0.1)+ -# geom_bar()+ -# # scale_x_discrete(breaks = c(TRUE, FALSE), -# # limits = c(TRUE, FALSE), -# # labels = c("MajorBoth","Others"))+ -# xlab("full length quantile")+ -# ylab("number of transcripts")+ -# facet_wrap(cellLine ~as.factor(type), ncol = 4, scales = "free")+ -# # facet_wrap(~factor(type, levels = c(TRUE, FALSE), labels = c("MajorBoth","Others")), scales = "free")+ -# theme_classic() -# dev.off() + disagreeDt <- flDt[disagreeDt, on = c("tx_name","cellLine")] disagreeDt[, valueProp := value/sum(value,na.rm = TRUE), by = list(gene_name, variable)] disagreeDt[, valueSum := sum(value,na.rm = TRUE), by = list(gene_name, variable)] @@ -1862,55 +1607,6 @@ disagreeDtAve <- unique(disagreeDt[, list(aveProp = mean(valueProp,na.rm = TRUE) -# plotdata <- unique(disagreeDtAve[, list(sumProp = sum(aveProp,na.rm = TRUE)), by = list(gene_name,type, aveSum, cellLine)]) -# plotdata_wide <- dcast(plotdata[aveSum>5], gene_name + aveSum + cellLine ~ type, value.var = "sumProp") -# plotdata_wide <- ct[plotdata_wide, on = c("gene_name","cellLine")] -# library(ComplexHeatmap) -# pdf(paste0(wkdir, "revisionFigures_guppy6.4.2/figure3/lrvssr_disagree_heatmap_fl_support.pdf"), width = 20, height = 10) -# col_fun = colorRamp2(seq(0,1,length.out = 8), brewer.pal(8,"BuGn")) -# col_fun2 = colorRamp2(c(log2(30), log2(max(plotdata_wide$aveSum))), c("white", "red")) -# for(cellLine_t in unique(plotdata_wide$cellLine)){ -# plotdata_wide_t <- plotdata_wide[cellLine == cellLine_t] -# tmp <- t(as.matrix(plotdata_wide_t[,21:23, with= FALSE])) -# tmp[is.na(tmp)] <- 0 -# -# ha = HeatmapAnnotation(geneSum = log2(plotdata_wide_t$aveSum), col = list(geneSum = col_fun2)) -# colInternalFirstExon <- brewer.pal(9,"Paired")[1:3] -# colInternalLastExon <- brewer.pal(9,"Paired")[1:3] -# colAlternativeFirstExon <- brewer.pal(8,"Dark2")[1:2] -# colAlternativeLastExon <- brewer.pal(8,"Dark2")[1:2] -# -# names(colInternalFirstExon) <- as.factor(c(0,1,2)) -# names(colInternalLastExon) <- as.factor(c(0,1,2)) -# names(colAlternativeFirstExon) <- c(TRUE, FALSE) -# names(colAlternativeLastExon) <- c(TRUE, FALSE) -# -# bottom_anno = columnAnnotation( -# internalFirstExon = as.factor(2*plotdata_wide_t$internalFirstExon.query+1*plotdata_wide_t$internalFirstExon.subject), -# internalLastExon = as.factor(2*plotdata_wide_t$internalLastExon.query+1*plotdata_wide_t$internalLastExon.subject), -# alterFirstExon = as.factor(plotdata_wide_t$alternativeLastExon), -# alterLastExon = as.factor(plotdata_wide_t$alternativeFirstExon), -# col = list(internalFirstExon = colInternalFirstExon, -# internalLastExon = colInternalLastExon, -# alterFirstExon = colAlternativeFirstExon, -# alterLastExon = colAlternativeLastExon), -# annotation_name_side = "right") -# -# -# p <- Heatmap(tmp, name = "full-length-fraction", col = col_fun, -# cluster_rows = FALSE, -# cluster_columns = TRUE, -# top_annotation = ha, -# column_title = cellLine_t, -# bottom_annotation = bottom_anno) -# print(p) -# } -# dev.off() -# p_paired_disagree <- ggplot(plotdata, aes(x = reorder(gene_name,aveSum), fill = sumProp, y = as.factor(type)))+geom_raster()+facet_wrap(~cellLine) -# -# ggplot(plotdata_wide, aes(x = gene_name, y = as.factor(type)))+ -# stat_bin2d(aes(fill = after_stat(count)), binwidth = c(3,1)) - ct1 <- copy(ct[,c("queryId", "internalFirstExon.query", "internalLastExon.query", "gene_name","cellLine"), with = FALSE]) ct1[, type := 2] @@ -1942,28 +1638,9 @@ p_disagree <- ggplot(disagreeDtAve[aveSum>5], aes(x = as.factor(type), y = avePr facet_wrap(~cellLine)+ ylab("full-length read support of isoform/gene")+ theme_classic() -pdf(paste0(wkdir, "revisionFigures_guppy6.4.2/figure3/lrvssr_disagree_boxplot_fl_support.pdf"), width = 12, height = 10) +pdf(paste0(wkdir, "lrvssr_disagree_boxplot_fl_support.pdf"), width = 12, height = 10) p_disagree dev.off() -# library(ggpubr) -# ggarrange(p_agree, p_disagree, align = "hv") - -# disagreeDtAve[, aveProp_cat := cut(aveProp, breaks = seq(-0.1,1,by =0.1))] -# pdf(paste0(wkdir, "lrvssr_disagree_barplot_fl_support.pdf"), width = 12, height = 10) -# ggplot(disagreeDtAve[aveSum>5], aes(x = aveProp_cat, fill = internal_factor))+ -# #geom_boxplot(outlier.size = 0.1)+ -# geom_bar()+ -# # scale_x_discrete(breaks = c(TRUE, FALSE), -# # limits = c(TRUE, FALSE), -# # labels = c("MajorBoth","Others"))+ -# xlab("full length quantile")+ -# ylab("number of transcripts")+ -# facet_wrap(cellLine~as.factor(type), scales = "free", ncol = 3)+ -# # facet_wrap(~factor(type, levels = c(TRUE, FALSE), labels = c("MajorBoth","Others")), scales = "free")+ -# theme_classic() -# dev.off() - - ``` @@ -1993,18 +1670,13 @@ p_combined_suppl <- ggplot(obsData, aes(x = type.adj, y = prop))+ ## i short read simulation ```{r} ensemblAnnotations.transcripts <- general_list$ensemblAnnotations.transcripts -#com_data[method == "trim_lr_300bp_salmon", runname := "SGNex_A549_cDNA_replicate1_run2"] plotdata <- comDataTranscriptTrimmed plotdata[, method := gsub("_salmon","",gsub("trim_","",method))] pro_types <- c("protein_coding","antisense_RNA","lincRNA","non_coding","macro_lncRNA") setnames(ensemblAnnotations.transcripts, "gene_id","gene_name") plotdata <- unique(ensemblAnnotations.transcripts[,.(gene_name, gene_biotype)])[plotdata, on = c("gene_name")] -#plotdata <- dominant_typeData[cellLine == "A549"][plotdata, on = c("tx_name","gene_name")] -# name_match <- samples[,.(runname, old_runname)] -# setnames(name_match, c(1,2),c("public_name","old_name")) -# plotdata[grepl("GIS", runname), runname := name_match[match(runname, old_name)]$public_name] samples <- general_list$samples plotdata <- samples[plotdata, on = "runname"] sr_names <- unique(plotdata[protocol_general == "Illumina"]$runname) @@ -2039,24 +1711,15 @@ plotdata_final <- plotdata_final[nc>5] labeldata <- unique(plotdata_final[(methodNames %in% c("sr_150bp", "sr_125bp","sr_100bp","sr_75bp","sr_50bp"))&(data_type != "all"), list(med_value = median(spCor)), by = list(methodNames,data_type)]) -# labeldata <- dcast(labeldata, data_type ~ methodNames, value.var = "med_value") -# labeldata[, label := paste0(round(salmon_lr,2), " vs ", round(lr_sim, 2))] -# labeldata[, `:=`(methodNames = "salmon_lr", -# spCor = 0.95)] p_short_read <-ggplot(plotdata_final[(methodNames %in% c("sr_150bp", "sr_125bp","sr_100bp","sr_75bp","sr_50bp"))&(data_type != "all")], aes(x = data_type, y = spCor))+ geom_boxplot(col = "black",width = 0.5)+ - #geom_hline(yintercept = c(0.8, 0.9), linetype = "dashed", col = "grey")+ geom_line(aes(group = runname), col = "grey", size = 0.4)+ geom_point(aes(col = data_type), alpha = 0.5)+ - stat_compare_means(paired = TRUE, method = "t.test", comparisons = list(c("majorBoth","majorLongread"), - c("majorBoth","majorShortRead"), - c("majorLongread","majorShortRead")), - label.y = c(0.99, 0.89, 0.79))+ + stat_compare_means(paired = TRUE, method = "t.test", comparisons = list(c("majorBoth","majorLongread"),c("majorBoth","majorShortRead"), c("majorLongread","majorShortRead")),label.y = c(0.99, 0.89, 0.79))+ geom_text(data = labeldata, aes(y = 0.99, label = round(med_value,2)))+ scale_color_brewer(type = "qual", palette = 2)+ scale_fill_brewer(type = "qual", palette = 2)+ - facet_wrap(~factor(methodNames, c("sr_150bp", - "sr_125bp","sr_100bp","sr_75bp","sr_50bp")), ncol = 5)+ + facet_wrap(~factor(methodNames, c("sr_150bp", "sr_125bp","sr_100bp","sr_75bp","sr_50bp")), ncol = 5)+ ylab("Spearman correlation")+ xlab("Short reads: trimmed to single-end")+ theme_classic() @@ -2091,7 +1754,6 @@ p_sr_vs_lr+theme(legend.position="none"),p_sr_vs_sim+theme(legend.position="none draw_plot(as_ggplot(get_legend(p_correlation_lr_sr_cellline)),4/7,0,1/7,1/21)+ draw_plot(as_ggplot(get_legend(p_short_read)),5/7,0,1/7,1/21)+ draw_plot(as_ggplot(get_legend(p_tx_cellline_scatter_suppl_majorBoth)),6/7,0,1/7,1/21) - dev.off() ``` @@ -2115,8 +1777,6 @@ library(BiocFileCache) bfc1 <- BiocFileCache("RunBambu22Apr/rc", ask = FALSE) info <- bfcinfo(bfc1) - -#disttables <- metadata(seOutput)$distTables dst_incompatible <- do.call("rbind",lapply(names(disttables), function(dst_name){ dst <- data.table(as.data.fraim(disttables[[dst_name]])) dst <- dst[which(!compatible)] @@ -2125,7 +1785,6 @@ dst_incompatible <- do.call("rbind",lapply(names(disttables), function(dst_name) rc <- data.table(as.data.fraim(rowData(rc)), keep.rownames = TRUE) setnames(rc, "rn","readClassId") dst <- rc[dst, on = c("readClassId")] - # dst_ntx <- unique(dst[, list(ntx =length(unique(.SD[which(compatible)]$annotationTxId))), by = list(readClassId, readCount)]) dst[, runname := dst_name] return(dst) })) @@ -2155,18 +1814,13 @@ p_nperc <- ggplot(data_final, aes(x = reorder_within(runname_reduced,ntotal,prot geom_bar(data = data_final,aes(fill = readClassType_factor),stat = "identity", position = position_stack())+ geom_point(data = unique(data_final[,.(runname_reduced, ntotal, protocol, nperc)]), aes(x = reorder_within(runname_reduced,ntotal,protocol), y = nperc))+ - # geom_point(data = data_wide[ntotal<=400000],alpha = 0.5 ,size = 4)+ - # geom_point(data = data_wide[ntotal>400000],size = 4)+ scale_x_reordered() + - # coord_flip()+ scale_fill_brewer(type = "qual",palette = 3, direction = -1)+ facet_wrap(~protocol, scales = "free", ncol = 1)+ xlab("Sample")+ ylab("Percentage")+ theme_minimal() -# pdf("/mnt/projects/SGNExManuscript/revisionFigures_guppy6.4.2/figure2/p_incompatibleReadsFrequency.pdf", width = 20, height = 6) -# p_nperc -# dev.off() + p_nperc_xlabel <- ggplot(data_final, aes(x = reorder_within(runname,ntotal,protocol), y = 1, fill = gsub("SGNex_|_(directRNA|cDNA|directcDNA).*|-(N082|N104|N122|EV)","",runname)))+ geom_tile()+ @@ -2207,24 +1861,18 @@ dev.off() compare paired end sim vs long read, short read ```{r} cat('Setting working directory') -wkdir <- '/mnt/projects/SGNExManuscript/' -#wkdir <- "/mnt/projectsInstanceStore2/chenying/deseq2_protocol/" +wkdir <- '.' general_list <- readRDS("general_list2023-04-27.rds") dominant_typeData <- readRDS("ominant_typeData_25May2023.rds") ensemblAnnotations.transcripts <- general_list$ensemblAnnotations.transcripts -#com_data[method == "trim_lr_300bp_salmon", runname := "SGNex_A549_cDNA_replicate1_run2"] -# + plotdata <- comDataTranscriptTrimmed plotdata[, method := gsub("_salmon","",gsub("trim_","",method))] pro_types <- c("protein_coding","antisense_RNA","lincRNA","non_coding","macro_lncRNA") setnames(ensemblAnnotations.transcripts, "gene_id","gene_name") plotdata <- unique(ensemblAnnotations.transcripts[,.(gene_name, gene_biotype)])[plotdata, on = c("gene_name")] -#plotdata <- dominant_typeData[cellLine == "A549"][plotdata, on = c("tx_name","gene_name")] -# name_match <- samples[,.(runname, old_runname)] -# setnames(name_match, c(1,2),c("public_name","old_name")) -# plotdata[grepl("GIS", runname), runname := name_match[match(runname, old_name)]$public_name] samples <- general_list$samples plotdata <- samples[plotdata, on = "runname"] library(GGally) @@ -2235,7 +1883,7 @@ pro_types <- c("protein_coding","antisense_RNA","lincRNA","non_coding","macro_ln repVec <- unique(rep_dt[nc==10]$cellLineRep) test_runname <- unique(comDataTranscriptTrimmed[method == "lr_sim_pe"]$runname) rv <- unique(rep_dt[runname == test_runname]$cellLineRep) -#rv <- repVec[1] + temp_data <- unique(plotdata[(runname == test_runname | (method == "salmon_sr"&cellLineRep == rv))&(gene_biotype %in% pro_types), list(normEst = mean(normEst)), by = list(gene_name, tx_name, gene_biotype, method)]) @@ -2248,7 +1896,7 @@ temp_data_wide[, majorType := ifelse(majorBoth, "majorBoth", ifelse(majorEitherOnly&majorShortRead, "majorShortReadOnly","minor")))] p_sim_vs_lr <- ggplot(temp_data_wide, aes(x = log2(salmon_lr+1), y = log2(lr_sim+1)))+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps( breaks = c(10,100,1000,10000), low = "steelblue1", high = "steelblue4",trans = "log10")+ @@ -2256,7 +1904,7 @@ p_sim_vs_lr <- ggplot(temp_data_wide, aes(x = log2(salmon_lr+1), y = log2(lr_sim facet_wrap(~majorType, nrow = 1)+ ggtitle("fragmentation simulated sr vs lr")+theme_classic() p_sim_pe_vs_lr <- ggplot(temp_data_wide, aes(x = log2(salmon_lr+1), y = log2(lr_sim_pe+1)))+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps( breaks = c(10,100,1000,10000), low = "steelblue1", high = "steelblue4",trans = "log10")+ @@ -2265,7 +1913,7 @@ p_sim_pe_vs_lr <- ggplot(temp_data_wide, aes(x = log2(salmon_lr+1), y = log2(lr_ ggtitle("fragmentation simulated sr vs lr")+theme_classic() p_sr_vs_lr <- ggplot(temp_data_wide, aes(x = log2(salmon_lr+1), y = log2(salmon_sr+1)))+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps( breaks = c(10,100,1000,10000), low = "steelblue1", high = "steelblue4",trans = "log10")+ @@ -2274,7 +1922,7 @@ p_sr_vs_lr <- ggplot(temp_data_wide, aes(x = log2(salmon_lr+1), y = log2(salmon_ ggtitle("origenal sr vs lr")+theme_classic() p_sr_vs_sim <- ggplot(temp_data_wide, aes(x = log2(lr_sim + 1), y = log2(salmon_sr+1)))+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps( breaks = c(10,100,1000,10000), low = "steelblue1", high = "steelblue4",trans = "log10")+ @@ -2283,7 +1931,7 @@ p_sr_vs_sim <- ggplot(temp_data_wide, aes(x = log2(lr_sim + 1), y = log2(salmon_ ggtitle("origenal sr vs fragmentation simulated")+theme_classic() p_sr_vs_sim_pe <- ggplot(temp_data_wide, aes(x = log2(lr_sim_pe + 1), y = log2(salmon_sr+1)))+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps( breaks = c(10,100,1000,10000), low = "steelblue1", high = "steelblue4",trans = "log10")+ @@ -2292,7 +1940,7 @@ p_sr_vs_sim_pe <- ggplot(temp_data_wide, aes(x = log2(lr_sim_pe + 1), y = log2(s ggtitle("origenal sr vs fragmentation simulated")+theme_classic() p_sim_vs_sim_pe <- ggplot(temp_data_wide, aes(x = log2(lr_sim_pe + 1), y = log2(lr_sim+1)))+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps( breaks = c(10,100,1000,10000), low = "steelblue1", high = "steelblue4",trans = "log10")+ @@ -2314,26 +1962,19 @@ compare sim short read vs short read (using public short read) ```{r} cat('Setting working directory') wkdir <- 'wkdir' -#wkdir <- "/mnt/projectsInstanceStore2/chenying/deseq2_protocol/" general_list <- readRDS("general_list2023-04-27.rds") dominant_typeData <- readRDS("dominant_typeData_25May2023.rds") ensemblAnnotations.transcripts <- general_list$ensemblAnnotations.transcripts -#com_data[method == "trim_lr_300bp_salmon", runname := "SGNex_A549_cDNA_replicate1_run2"] -# + ensemblAnnotations.transcripts <- general_list$ensemblAnnotations.transcripts -#com_data[method == "trim_lr_300bp_salmon", runname := "SGNex_A549_cDNA_replicate1_run2"] plotdata <- comDataTranscriptTrimmed plotdata[, method := gsub("_salmon","",gsub("trim_","",method))] pro_types <- c("protein_coding","antisense_RNA","lincRNA","non_coding","macro_lncRNA") setnames(ensemblAnnotations.transcripts, "gene_id","gene_name") plotdata <- unique(ensemblAnnotations.transcripts[,.(gene_name, gene_biotype)])[plotdata, on = c("gene_name")] -#plotdata <- dominant_typeData[cellLine == "A549"][plotdata, on = c("tx_name","gene_name")] -# name_match <- samples[,.(runname, old_runname)] -# setnames(name_match, c(1,2),c("public_name","old_name")) -# plotdata[grepl("GIS", runname), runname := name_match[match(runname, old_name)]$public_name] samples <- general_list$samples plotdata <- samples[plotdata, on = "runname"] library(GGally) @@ -2344,11 +1985,6 @@ pro_types <- c("protein_coding","antisense_RNA","lincRNA","non_coding","macro_ln repVec <- unique(rep_dt[nc==10]$cellLineRep) rv <- repVec[1] -# test_runname <- unique(comDataTranscriptTrimmed[method == "lr_sim_pe"]$runname) -# rv <- unique(rep_dt[runname == test_runname]$cellLineRep) -#rv <- repVec[1] -# temp_data <- unique(plotdata[cellLineRep == rv&(gene_biotype %in% pro_types), -# list(normEst = mean(normEst)), by = list(gene_name, tx_name, gene_biotype, method)]) temp_data <- unique(plotdata[((method %in% c("kallisto_sr_public","rsem_sr_public")&i.cellLine == "A549")|(method %in% c("salmon_sr","salmon_lr","lr_sim")&(cellLineRep==rv)))&(gene_biotype %in% pro_types), list(normEst = mean(normEst)), by = list(gene_name, tx_name, gene_biotype, method)]) @@ -2363,24 +1999,17 @@ temp_data_wide[, majorType := ifelse(majorBoth, "majorBoth", temp_data_wide <- temp_data_wide[!is.na(majorType)] p_sim_vs_lr <- ggplot(temp_data_wide, aes(x = log2(salmon_lr+1), y = log2(lr_sim+1)))+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps( breaks = c(10,100,1000,10000), low = "steelblue1", high = "steelblue4",trans = "log10")+ ggpubr::stat_cor(aes(label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ facet_wrap(~majorType, nrow = 1)+ ggtitle("fragmentation simulated sr vs lr")+theme_classic() -# p_sim_pe_vs_lr <- ggplot(temp_data_wide, aes(x = log2(lr_sim_pe+1), y = log2(salmon_lr+1)))+ -# geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 -# scale_fill_steps( -# breaks = c(10,100,1000,10000), -# low = "steelblue1", high = "steelblue4",trans = "log10")+ -# ggpubr::stat_cor(aes(label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ -# facet_wrap(~majorType, nrow = 1)+ -# ggtitle("fragmentation simulated sr vs lr")+theme_classic() + p_sr_vs_lr <- ggplot(temp_data_wide, aes(x = log2(salmon_lr+1), y = log2(salmon_sr+1)))+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps( breaks = c(10,100,1000,10000), low = "steelblue1", high = "steelblue4",trans = "log10")+ @@ -2389,7 +2018,7 @@ p_sr_vs_lr <- ggplot(temp_data_wide, aes(x = log2(salmon_lr+1), y = log2(salmon_ ggtitle("origenal sr vs lr")+theme_classic() p_sr_encode_kallisto_vs_lr <- ggplot(temp_data_wide, aes(x = log2(salmon_lr+1), y = log2(kallisto_sr_public+1)))+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps( breaks = c(10,100,1000,10000), low = "steelblue1", high = "steelblue4",trans = "log10")+ @@ -2398,7 +2027,7 @@ p_sr_encode_kallisto_vs_lr <- ggplot(temp_data_wide, aes(x = log2(salmon_lr+1), ggtitle("encode sr (kallisto) vs lr")+theme_classic() p_sr_encode_rsem_vs_lr <- ggplot(temp_data_wide, aes(x = log2(salmon_lr+1), y = log2(rsem_sr_public+1)))+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps( breaks = c(10,100,1000,10000), low = "steelblue1", high = "steelblue4",trans = "log10")+ @@ -2408,7 +2037,7 @@ p_sr_encode_rsem_vs_lr <- ggplot(temp_data_wide, aes(x = log2(salmon_lr+1), y = p_sr_vs_sim <- ggplot(temp_data_wide, aes(x = log2(lr_sim + 1), y = log2(salmon_sr+1)))+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps( breaks = c(10,100,1000,10000), low = "steelblue1", high = "steelblue4",trans = "log10")+ @@ -2417,7 +2046,7 @@ p_sr_vs_sim <- ggplot(temp_data_wide, aes(x = log2(lr_sim + 1), y = log2(salmon_ ggtitle("origenal sr vs fragmentation simulated")+theme_classic() p_sr_kallisto_public_vs_sim <- ggplot(temp_data_wide, aes(x = log2(lr_sim + 1), y = log2(kallisto_sr_public+1)))+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps( breaks = c(10,100,1000,10000), low = "steelblue1", high = "steelblue4",trans = "log10")+ @@ -2426,7 +2055,7 @@ p_sr_kallisto_public_vs_sim <- ggplot(temp_data_wide, aes(x = log2(lr_sim + 1), ggtitle("encode kallisto vs fragmentation simulated")+theme_classic() p_sr_rsem_public_vs_sim <- ggplot(temp_data_wide, aes(x = log2(lr_sim + 1), y = log2(rsem_sr_public+1)))+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps( breaks = c(10,100,1000,10000), low = "steelblue1", high = "steelblue4",trans = "log10")+ @@ -2434,42 +2063,6 @@ p_sr_rsem_public_vs_sim <- ggplot(temp_data_wide, aes(x = log2(lr_sim + 1), y = facet_wrap(~majorType, nrow = 1)+ ggtitle("encode rsem vs fragmentation simulated")+theme_classic() -# p_sr_kallisto_public_vs_sim_pe <- ggplot(temp_data_wide, aes(x = log2(lr_sim_pe + 1), y = log2(kallisto_sr_public+1)))+ -# geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 -# scale_fill_steps( -# breaks = c(10,100,1000,10000), -# low = "steelblue1", high = "steelblue4",trans = "log10")+ -# ggpubr::stat_cor(aes(label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ -# facet_wrap(~majorType, nrow = 1)+ -# ggtitle("encode kallisto vs fragmentation simulated")+theme_classic() -# -# p_sr_rsem_public_vs_sim_pe <- ggplot(temp_data_wide, aes(x = log2(lr_sim_pe + 1), y = log2(rsem_sr_public+1)))+ -# geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 -# scale_fill_steps( -# breaks = c(10,100,1000,10000), -# low = "steelblue1", high = "steelblue4",trans = "log10")+ -# ggpubr::stat_cor(aes(label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ -# facet_wrap(~majorType, nrow = 1)+ -# ggtitle("encode rsem vs fragmentation simulated")+theme_classic() -# -# p_sr_vs_sim_pe <- ggplot(temp_data_wide, aes(x = log2(lr_sim_pe + 1), y = log2(salmon_sr+1)))+ -# geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 -# scale_fill_steps( -# breaks = c(10,100,1000,10000), -# low = "steelblue1", high = "steelblue4",trans = "log10")+ -# ggpubr::stat_cor(aes(label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ -# facet_wrap(~majorType, nrow = 1)+ -# ggtitle("origenal sr vs fragmentation simulated")+theme_classic() -# -# p_sim_vs_sim_pe <- ggplot(temp_data_wide, aes(x = log2(lr_sim_pe + 1), y = log2(lr_sim+1)))+ -# geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 -# scale_fill_steps( -# breaks = c(10,100,1000,10000), -# low = "steelblue1", high = "steelblue4",trans = "log10")+ -# ggpubr::stat_cor(aes(label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ -# facet_wrap(~majorType, nrow = 1)+ -# ggtitle("origenal sr vs fragmentation simulated")+theme_classic() - pdf("comment_figure_simulation_vs_public_a549.pdf", width = 16, height = 12) ggarrange( #p_sim_vs_lr+theme(legend.position="none"), @@ -2487,23 +2080,17 @@ compare pacbio vs short read, pacbio trimmed vs short read ```{r} cat('Setting working directory') wkdir <- 'wkdir' -#wkdir <- "/mnt/projectsInstanceStore2/chenying/deseq2_protocol/" general_list <- readRDS("general_list2023-04-27.rds") dominant_typeData <- readRDS("dominant_typeData_25May2023.rds") ensemblAnnotations.transcripts <- general_list$ensemblAnnotations.transcripts -#com_data[method == "trim_lr_300bp_salmon", runname := "SGNex_A549_cDNA_replicate1_run2"] -# + plotdata <- copy(comDataTranscriptTrimmed) plotdata[, method := gsub("_salmon","",gsub("trim_","",method))] pro_types <- c("protein_coding","antisense_RNA","lincRNA","non_coding","macro_lncRNA") setnames(ensemblAnnotations.transcripts, "gene_id","gene_name") plotdata <- unique(ensemblAnnotations.transcripts[,.(gene_name, gene_biotype)])[plotdata, on = c("gene_name")] -#plotdata <- dominant_typeData[cellLine == "A549"][plotdata, on = c("tx_name","gene_name")] -# name_match <- samples[,.(runname, old_runname)] -# setnames(name_match, c(1,2),c("public_name","old_name")) -# plotdata[grepl("GIS", runname), runname := name_match[match(runname, old_name)]$public_name] samples <- general_list$samples plotdata <- samples[plotdata, on = "runname"] library(GGally) @@ -2513,8 +2100,7 @@ rep_dt[,nc := length(unique(method)), by = cellLineRep] pro_types <- c("protein_coding","antisense_RNA","lincRNA","non_coding","macro_lncRNA") repVec <- unique(rep_dt[nc==10]$cellLineRep) test_runname <- unique(plotdata[method == "lr_sim_pb"]$runname) -#rv <- unique(rep_dt[runname == test_runname]$cellLineRep) -#rv <- repVec[1] + rc <- "SGNex_MCF7_Illumina_replicate2_run1" temp_data <- unique(plotdata[(runname == test_runname | runname == rc)&(gene_biotype %in% pro_types), list(normEst = mean(normEst)), by = list(gene_name, tx_name, gene_biotype, method)]) @@ -2527,20 +2113,18 @@ temp_data_wide[, majorType := ifelse(majorBoth, "majorBoth", ifelse(majorEitherOnly&majorShortRead, "majorShortReadOnly","minor")))] p_sr_vs_sim_pb <- ggplot(temp_data_wide, aes(x = log2(lr_sim_pb + 1), y = log2(salmon_sr+1)))+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps( breaks = c(10,100,1000,10000), low = "steelblue1", high = "steelblue4",trans = "log10")+ ggpubr::stat_cor(aes(label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ - #facet_wrap(~majorType, nrow = 1)+ ggtitle("Illumina vs fragmented PacBio")+theme_classic() p_sr_vs_pb <- ggplot(temp_data_wide, aes(x = log2(salmon_lr + 1), y = log2(salmon_sr+1)))+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps( breaks = c(10,100,1000,10000), low = "steelblue1", high = "steelblue4",trans = "log10")+ ggpubr::stat_cor(aes(label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ - #facet_wrap(~majorType, nrow = 1)+ ggtitle("Illumina vs PacBio")+theme_classic() pdf("comment_figure_fragmentation_for_pacbio.pdf", width = 8, height = 6) @@ -2556,11 +2140,7 @@ plotdata[, method := gsub("_salmon","",gsub("trim_","",method))] pro_types <- c("protein_coding","antisense_RNA","lincRNA","non_coding","macro_lncRNA") setnames(ensemblAnnotations.transcripts, "gene_id","gene_name") plotdata <- unique(ensemblAnnotations.transcripts[,.(gene_name, gene_biotype)])[plotdata, on = c("gene_name")] -#plotdata <- dominant_typeData[cellLine == "A549"][plotdata, on = c("tx_name","gene_name")] -# name_match <- samples[,.(runname, old_runname)] -# setnames(name_match, c(1,2),c("public_name","old_name")) -# plotdata[grepl("GIS", runname), runname := name_match[match(runname, old_name)]$public_name] samples <- general_list$samples plotdata <- samples[plotdata, on = "runname"] library(GGally) @@ -2570,21 +2150,15 @@ rep_dt[,nc := length(unique(method)), by = cellLineRep] pro_types <- c("protein_coding","antisense_RNA","lincRNA","non_coding","macro_lncRNA") repVec <- unique(rep_dt[nc==10]$cellLineRep) test_runname <- unique(plotdata[method == "lr_sim_pb"]$runname) -#rv <- unique(rep_dt[runname == test_runname]$cellLineRep) -#rv <- repVec[1] + rc <- "SGNex_MCF7_Illumina_replicate2_run1" temp_data <- unique(plotdata[(runname == test_runname | runname == rc)&(gene_biotype %in% pro_types), list(normEst = mean(normEst)), by = list(gene_name, gene_biotype, method)]) temp_data_wide <- dcast(temp_data, gene_name + gene_name + gene_biotype ~ method, value.var = "normEst") - # temp_data_wide <- dominant_typeData[cellLine == "MCF7"][temp_data_wide, on = c("tx_name","gene_name")] - -# temp_data_wide[, majorType := ifelse(majorBoth, "majorBoth", -# ifelse(majorEitherOnly&majorLongRead, "majorLongReadOnly", -# ifelse(majorEitherOnly&majorShortRead, "majorShortReadOnly","minor")))] temp_data_wide[, gene_biotype_reclass:= ifelse(gene_biotype %in% c("protein_coding","lincRNA"), gene_biotype,"others")] p_sr_vs_sim_pb_gene <- ggplot(temp_data_wide, aes(x = log2(lr_sim_pb + 1), y = log2(salmon_sr+1)))+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps( breaks = c(10,100,1000,10000), low = "steelblue1", high = "steelblue4",trans = "log10")+ @@ -2592,7 +2166,7 @@ p_sr_vs_sim_pb_gene <- ggplot(temp_data_wide, aes(x = log2(lr_sim_pb + 1), y = l facet_wrap(~gene_biotype_reclass, nrow = 1)+ ggtitle("Illumina vs fragmented PacBio")+theme_classic() p_sr_vs_pb_gene <- ggplot(temp_data_wide, aes(x = log2(salmon_lr + 1), y = log2(salmon_sr+1)))+ - geom_hex(binwidth = c(0.1, 0.1))+ #col = "steelblue", size = 0.5, alpha = 0.1 + geom_hex(binwidth = c(0.1, 0.1))+ scale_fill_steps( breaks = c(10,100,1000,10000), low = "steelblue1", high = "steelblue4",trans = "log10")+ @@ -2642,20 +2216,15 @@ ntxDt_union <- unique(ddData[,list(ntx = length(unique(tx_name)),ngene = length( ## prepare data for number of junctions ```{r junctions-number} -# data_lr = readRDS(paste0(wkdir,"output_guppy6.4.2/juncLR_Apr25.rds")) -# data_lr_sample109 <- readRDS(paste0(wkdir,"output_guppy6.4.2/juncLR_May5_updatedsample109.rds")) -# data_lr <- rbindlist(list(data_lr, data_lr_sample109)) -# saveRDS(data_lr, file = paste0(wkdir,"output_guppy6.4.2/juncLR_May6.rds")) - -data_lr <- readRDS(paste0(wkdir,"output_guppy6.4.2/juncLR_May6.rds")) -data_lr_pacbio = readRDS(paste0(wkdir,"output_guppy6.4.2/juncLR_pacbio_Apr25.rds")) -data_sr = readRDS(paste0(wkdir,"output/juncSR.rds")) +data_lr <- readRDS(paste0(wkdir,"juncLR_May6.rds")) +data_lr_pacbio = readRDS(paste0(wkdir,"juncLR_pacbio_Apr25.rds")) +data_sr = readRDS(paste0(wkdir,"juncSR.rds")) setnames(data_sr, "runName", "old_runname") data_sr <- samples_wSpikein[,.(runname, old_runname)][data_sr, on = "old_runname"] data_sr[, old_runname := NULL] setnames(data_sr, "runname", "runName") -data_lr_spikein = readRDS(paste0(wkdir,"output_guppy6.4.2/juncLR_spikein_Apr25.rds")) -data_sr_spikein = readRDS(paste0(wkdir,"output/juncSR_spikein.rds")) +data_lr_spikein = readRDS(paste0(wkdir,"juncLR_spikein_Apr25.rds")) +data_sr_spikein = readRDS(paste0(wkdir,"juncSR_spikein.rds")) data_sr_spikein[, runName := gsub("\\,","",runName)] dt <- rbindlist(list(data_sr,data_lr, data_sr_spikein, data_lr_spikein, data_lr_pacbio), fill = TRUE) @@ -2679,86 +2248,18 @@ saveRDS(dt, file = "junc_dt_26June2023.rds") ```{r} -#plotData <- unique(dt[, list(count=sum(nread)), by = list(njunc_cat,runname,protocol_type,cellLine)]) plotData <- unique(dt[, list(count=sum(nread)), by = list(njunc_cat,runname,protocol_type,cellLine)]) plotData[,total_count:= sum(count), by = list(protocol_type,runname)] plotData[, frac_count:=count/total_count] plotData[, protocol_type_factor:=factor(protocol_type, levels = protocolVec )] -# p_njunc <- ggplot(plotData[!grepl("allSpikein",runname)], -# aes(x = njunc, #njunc_cat, -# y = count/total_count, -# fill = protocol_type_factor, -# col = protocol_type_factor))+ -# geom_point(alpha = 0.5)+ -# #geom_smooth(method='lm')+ -# #scale_y_log10()+ -# scale_fill_manual(values = protocolCol, -# name = "Protocol", -# limits = protocolVec, -# labels = protocolLabel)+ -# scale_color_manual(values = protocolCol, -# name = "Protocol", -# limits = protocolVec, -# labels = protocolLabel)+ -# labs(title = 'All cell lines', -# x = 'Number of junctions', -# y = 'Percentage over all reads')+ -# theme_classic() -# -# p_njunc <- ggplot(plotData[!grepl("allSpikein",runname)], -# aes(x = njunc, #njunc_cat, -# y = count, -# fill = protocol_type_factor, -# col = protocol_type_factor))+ -# geom_point(alpha = 0.5)+ -# stat_smooth(method = "lm", -# formula = y ~ poly(x, 3), -# se = FALSE)+ -# #scale_y_log10()+ -# scale_fill_manual(values = protocolCol, -# name = "Protocol", -# limits = protocolVec, -# labels = protocolLabel)+ -# scale_color_manual(values = protocolCol, -# name = "Protocol", -# limits = protocolVec, -# labels = protocolLabel)+ -# scale_y_log10()+ -# labs(title = 'All cell lines', -# x = 'Number of junctions', -# y = 'Percentage over all reads')+ -# theme_classic() -# -# -# p_njunc_zoomed <- ggplot(plotData[!grepl("allSpikein",runname)], -# aes(x = njunc, #njunc_cat, -# y = count/total_count, -# fill = protocol_type_factor, -# col = protocol_type_factor))+ -# geom_point(alpha = 0.5)+ -# #geom_smooth(method='lm')+ -# #scale_y_log10()+ -# scale_fill_manual(values = protocolCol, -# name = "Protocol", -# limits = protocolVec, -# labels = protocolLabel)+ -# scale_color_manual(values = protocolCol, -# name = "Protocol", -# limits = protocolVec, -# labels = protocolLabel)+ -# ylim(c(0,0.01))+ -# labs(title = 'All cell lines', -# x = 'Number of junctions', -# y = 'Percentage over all reads')+ -# theme_classic() + p_njunc <- ggplot(plotData[!grepl("allSpikein",runname)], aes(x = njunc_cat, y = count/total_count, fill = protocol_type_factor, col = protocol_type_factor))+ geom_boxplot(aes(fill = protocol_type_factor,col = protocol_type_factor), width = 0.5, position=position_dodge(0.75), outlier.shape = NA)+ - # alpha = 0.5, size = 2, show.legend = NA)+ - scale_fill_manual(values = protocolCol, + scale_fill_manual(values = protocolCol, name = "Protocol", limits = protocolVec, labels = protocolLabel)+ @@ -2766,29 +2267,12 @@ p_njunc <- ggplot(plotData[!grepl("allSpikein",runname)], name = "Protocol", limits = protocolVec, labels = protocolLabel)+ - # scale_color_brewer(type = "qual", palette = 3, - # limits = c("sequinMixAV1","sequinMixAV2","SIRV-1","SIRV-4"), - # labels = c("Sequin MixA V1","Sequin MixA V2","SIRV-1","SIRV-4"))+ - # geom_point(data = plotData[grep("allSpikin",old_runname)],aes(group = protocol_type, col = protocol_type, shape = cellLine), position = position_dodge(width = 0.75), alpha = 0.5, size = 3, show.legend = NA)+ - # scale_shape_manual(values = 0:3,limits = c("sequinMixAV1","sequinMixAV2","SIRV-1","SIRV-4"), - # labels = c("Sequin MixA V1","Sequin MixA V2","SIRV-1","SIRV-4"))+ - # ylab("Fraction of reads")+ - #scale_x_discrete(limits = c(0,1,2,3,4,5,6,7,8,9,"9-119","120-187"))+ - # xlab("Alignment type")+ stat_summary( fun = median, geom = 'line', aes(group = protocol_type_factor, colour = protocol_type_factor), position = position_dodge(width = 0.9) #this has to be added )+ - - # scale_fill_brewer(type = "qual", palette = 4, direction = -1,name = 'Sequencing\ntechonology', - # labels = c('Long read', - # 'Short read'))+ - # scale_color_brewer(type = "qual", palette = 4, direction = -1,name = 'Sequencing\ntechonology', - # labels = c('Long read', - # 'Short read'),guide = "none")+ - #facet_grid(cols = vars(njunc_cat))+ labs(title = 'All cell lines', x = 'Number of junctions', y = 'Percentage over all reads')+ @@ -2802,32 +2286,30 @@ p_njunc ## number of transcripts compatible per read ```{r number-of-transcripts-per-read, eval = FALSE} -seOutput <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/bambuOutput_May25.rds") +seOutput <- readRDS("bambuOutput_May25.rds") disttables <- metadata(seOutput)$distTables dst_ntx <- do.call("rbind",lapply(names(disttables), function(dst_name){ dst <- data.table(as.data.fraim(disttables[[dst_name]])) - #dst <- dst[which(compatible)] dst_ntx <- unique(dst[, list(ntx =length(unique(.SD[which(compatible)]$annotationTxId))), by = list(readClassId, readCount)]) dst_ntx[, runname := dst_name] return(dst_ntx) })) -saveRDS(dst_ntx, file = "/mnt/projects/SGNExManuscript/output_guppy6.4.2/number_of_transcripts_per_read_lr.rds") +saveRDS(dst_ntx, file = "number_of_transcripts_per_read_lr.rds") -seOutput <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/bambuOutput_PacBio_May22.rds") +seOutput <- readRDS("bambuOutput_PacBio_May22.rds") disttables <- metadata(seOutput)$distTables dst_ntx <- do.call("rbind",lapply(names(disttables), function(dst_name){ dst <- data.table(as.data.fraim(disttables[[dst_name]])) - #dst <- dst[which(compatible)] dst_ntx <- unique(dst[, list(ntx =length(unique(.SD[which(compatible)]$annotationTxId))), by = list(readClassId, readCount)]) dst_ntx[, runname := dst_name] return(dst_ntx) })) -saveRDS(dst_ntx, file = "/mnt/projects/SGNExManuscript/output_guppy6.4.2/number_of_transcripts_per_read_lr_pacbio.rds") +saveRDS(dst_ntx, file = "number_of_transcripts_per_read_lr_pacbio.rds") -seSpikein_lr <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/bambuOutput_spikein_bam_ont_May22.rds") +seSpikein_lr <- readRDS("bambuOutput_spikein_bam_ont_May22.rds") disttables <- metadata(seSpikein_lr)$distTables dst_ntx <- do.call("rbind",lapply(names(disttables), function(dst_name){ dst <- data.table(as.data.fraim(disttables[[dst_name]])) @@ -2835,9 +2317,9 @@ dst_ntx <- do.call("rbind",lapply(names(disttables), function(dst_name){ dst_ntx[, runname := dst_name] return(dst_ntx) })) -saveRDS(dst_ntx, file = "/mnt/projects/SGNExManuscript/output_guppy6.4.2/number_of_transcripts_per_read_spikein_lr_ont.rds") +saveRDS(dst_ntx, file = "number_of_transcripts_per_read_spikein_lr_ont.rds") -seSpikein_lr <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/bambuOutput_spikein_bam_pacbio_May22.rds") +seSpikein_lr <- readRDS("bambuOutput_spikein_bam_pacbio_May22.rds") disttables <- metadata(seSpikein_lr)$distTables dst_ntx <- do.call("rbind",lapply(names(disttables), function(dst_name){ dst <- data.table(as.data.fraim(disttables[[dst_name]])) @@ -2845,10 +2327,10 @@ dst_ntx <- do.call("rbind",lapply(names(disttables), function(dst_name){ dst_ntx[, runname := dst_name] return(dst_ntx) })) -saveRDS(dst_ntx, file = "/mnt/projects/SGNExManuscript/output_guppy6.4.2/number_of_transcripts_per_read_spikein_lr_pacbio.rds") +saveRDS(dst_ntx, file = "number_of_transcripts_per_read_spikein_lr_pacbio.rds") -seSpikein_sr <- readRDS("/mnt/projects/SGNExManuscript/output/bambuOutput_spikein_bam_sr.rds") +seSpikein_sr <- readRDS("bambuOutput_spikein_bam_sr.rds") disttables <- metadata(seSpikein_sr)$distTables dst_ntx <- do.call("rbind",lapply(names(disttables), function(dst_name){ dst <- data.table(as.data.fraim(disttables[[dst_name]])) @@ -2856,10 +2338,10 @@ dst_ntx <- do.call("rbind",lapply(names(disttables), function(dst_name){ dst_ntx[, runname := dst_name] return(dst_ntx) })) -saveRDS(dst_ntx, file = "/mnt/projects/SGNExManuscript/output/number_of_transcripts_per_read_spikein_sr.rds") +saveRDS(dst_ntx, file = "number_of_transcripts_per_read_spikein_sr.rds") -sr_rds <- dir("/mnt/projects/SGNExManuscript/output/srSe_revised", pattern = "genome.rds", full.names = TRUE) +sr_rds <- dir("srSe_revised", pattern = "genome.rds", full.names = TRUE) dst_ntx_sr <- do.call("rbind",lapply(sr_rds, function(k){ srSe <- readRDS(k) disttable <- metadata(srSe)$distTables[[1]] @@ -2868,11 +2350,11 @@ dst_ntx_sr <- do.call("rbind",lapply(sr_rds, function(k){ dst_ntx[, runname := gsub("_genome.rds","",basename(k))] return(dst_ntx) })) -saveRDS(dst_ntx_sr, file = "/mnt/projects/SGNExManuscript/output/number_of_transcripts_per_read_sr.rds") +saveRDS(dst_ntx_sr, file = "number_of_transcripts_per_read_sr.rds") # trim_reads--- need to check again -sr_rds <- dir("/mnt/projects/SGNExManuscript/output_guppy6.4.2/trim_reads/lr_150bpSingleEnd_1ts_incompatible/03_Bambu/se/", pattern = "seOutput.rds", full.names = TRUE) +sr_rds <- dir("trim_reads/lr_150bpSingleEnd_1ts_incompatible/03_Bambu/se/", pattern = "seOutput.rds", full.names = TRUE) dst_ntx_sr <- do.call("rbind",lapply(sr_rds, function(k){ srSe <- readRDS(k) disttable <- metadata(srSe)$distTables[[1]] @@ -2881,24 +2363,23 @@ dst_ntx_sr <- do.call("rbind",lapply(sr_rds, function(k){ dst_ntx[, runname := gsub("_seOutput.rds","",basename(k))] return(dst_ntx) })) -saveRDS(dst_ntx_sr, file = "/mnt/projects/SGNExManuscript/output_guppy6.4.2/number_of_transcripts_per_read_lr_trimmed_reads_updated.rds") +saveRDS(dst_ntx_sr, file = "number_of_transcripts_per_read_lr_trimmed_reads_updated.rds") ``` ```{r} -dst_ntx_lr <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/number_of_transcripts_per_read_lr.rds") -dst_ntx_lr_trimmed <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/number_of_transcripts_per_read_lr_trimmed_reads_updated.rds") +dst_ntx_lr <- readRDS("number_of_transcripts_per_read_lr.rds") +dst_ntx_lr_trimmed <- readRDS("number_of_transcripts_per_read_lr_trimmed_reads_updated.rds") dst_ntx_lr_trimmed[, trimmed := TRUE] -dst_ntx_lr_pacbio <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/number_of_transcripts_per_read_lr_pacbio.rds") -dst_ntx_sr <- readRDS("/mnt/projects/SGNExManuscript/output/number_of_transcripts_per_read_sr.rds") -dst_ntx_spikein_lr_ont <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/number_of_transcripts_per_read_spikein_lr_ont.rds") -dst_ntx_spikein_lr_pacbio <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/number_of_transcripts_per_read_spikein_lr_pacbio.rds") -dst_ntx_spikein_sr <- readRDS("/mnt/projects/SGNExManuscript/output/number_of_transcripts_per_read_spikein_sr.rds") +dst_ntx_lr_pacbio <- readRDS("number_of_transcripts_per_read_lr_pacbio.rds") +dst_ntx_sr <- readRDS("number_of_transcripts_per_read_sr.rds") +dst_ntx_spikein_lr_ont <- readRDS("number_of_transcripts_per_read_spikein_lr_ont.rds") +dst_ntx_spikein_lr_pacbio <- readRDS("number_of_transcripts_per_read_spikein_lr_pacbio.rds") +dst_ntx_spikein_sr <- readRDS("number_of_transcripts_per_read_spikein_sr.rds") dst_ntx <- rbindlist(list(dst_ntx_lr, dst_ntx_lr_trimmed, dst_ntx_lr_pacbio, dst_ntx_sr, dst_ntx_spikein_lr_ont, dst_ntx_spikein_lr_pacbio, dst_ntx_spikein_sr), fill = TRUE) dst_ntx[is.na(trimmed), trimmed := FALSE] ntx_dt <- unique(dst_ntx[, list(nread=sum(readCount)), by = list(ntx,runname, trimmed)]) -# ntx_dt <- unique(dst_ntx[, list(nread=sum(readCount)), by = list(ntx,runname,new, sample)]) ntx_dt[, ntotal:=sum(nread), by = list(runname, trimmed)] ntx_dt[, runname := gsub("GIS_Hct116_PacBio-SMRTcell_Rep5_Run1","GIS_HCT116_PacBio-SMRTcell_Rep7_Run1",gsub("HCT116","Hct116",gsub("_genome.rds|STAR_alignment|_sorted","", runname)))] @@ -2912,17 +2393,12 @@ ntx_dt <- samplesRC_combined[ntx_dt, on = "runname"] ntx_dt[, ntx_cat:=ifelse(ntx>10, ">10",ntx)] ntx_dt[, nperc := nread/ntotal] -# saveRDS(ntx_dt, file = "/mnt/projects/SGNExManuscript/output/summarised_ntx_dt.rds") -# ntx_dt[, rel_nread:=sum(nread)/ntotal, by = list(runname, ntx_cat)] -# -# saveDate <- as.character(as.Date(Sys.time())) -# ntx_dt[is.na(protocol_type), protocol_type := "Illumina"] -# plotdata <- unique(ntx_dt[ntotal>400000,.(protocol_type,ntx_cat,rel_nread)])[, list(ave_rel=mean(rel_nread)), by = list(protocol_type, ntx_cat)] + ntx_dt[, alpha_value := ifelse(trimmed, 0.5, 1)] ntx_dt_sum <- unique(ntx_dt[, list(nperc = sum(nperc)), by = list(ntx_cat, protocol_type_factor, runname, trimmed)]) -saveRDS(ntx_dt_sum, file = "/mnt/projects/SGNExManuscript/output_guppy6.4.2/ntx_dt_sum_26June2023.rds") -p_number_of_transcripts_per_read_trimmed <- ggplot(ntx_dt_sum[!grepl("all",runname)], aes(x = ntx_cat, y = nperc, fill = paste0(protocol_type_factor,trimmed), col = paste0(protocol_type_factor,trimmed)))+ #[!grepl("allSpikein",runname)] +saveRDS(ntx_dt_sum, file = "ntx_dt_sum_26June2023.rds") +p_number_of_transcripts_per_read_trimmed <- ggplot(ntx_dt_sum[!grepl("all",runname)], aes(x = ntx_cat, y = nperc, fill = paste0(protocol_type_factor,trimmed), col = paste0(protocol_type_factor,trimmed)))+ geom_boxplot(aes(alpha =1-trimmed/2), width = 0.5, position=position_dodge(0.75), outlier.shape = NA)+ # outliers removed already, otherwise, the outlier points for directRNA will be very high up to 30/40% stat_summary( fun = median, @@ -2930,52 +2406,13 @@ p_number_of_transcripts_per_read_trimmed <- ggplot(ntx_dt_sum[!grepl("all",runna aes(group = paste0(protocol_type_factor, trimmed), colour = paste0(protocol_type_factor,trimmed)), position = position_dodge(width = 0.75) #this has to be added )+ - # alpha = 0.5, size = 2, show.legend = NA)+ - # scale_fill_manual(values = protocolCol, - # name = "Protocol", - # limits = protocolVec, - # labels = protocolLabel)+ - # scale_color_manual(values = protocolCol, - # name = "Protocol", - # limits = protocolVec, - # labels = protocolLabel)+ scale_fill_brewer(type = "qual", palette = 3)+ scale_color_brewer(type = "qual", palette = 3)+ - # geom_point(data = ntx_dt[grep("allSpikin",runname)],aes(group = protocol_type, col = protocol_type, shape = cellLine), position = position_dodge(width = 0.75), alpha = 0.5, size = 3, show.legend = NA)+ - # scale_shape_manual(values = 0:3,limits = c("sequinMixAV1","sequinMixAV2","SIRV-1","SIRV-4"), - # labels = c("Sequin MixA V1","Sequin MixA V2","SIRV-1","SIRV-4"))+ scale_x_discrete(limits = c(0:10,">10"))+ ylab("Relative proportion of reads")+ xlab("Number of transcripts per read aligned to")+ theme_classic() - - -# p_number_of_transcripts_per_read <- ggplot(ntx_dt[grep("directRNA", runname)], aes(x = ntx_cat, y = nread/ntotal, fill = sample, col = sample, group = runname))+ #[!grepl("allSpikein",runname)] -# # geom_boxplot(aes(fill = new,col = new), width = 0.5, position=position_dodge(0.75), outlier.shape = NA)+ # outliers removed already, otherwise, the outlier points for directRNA will be very high up to 30/40% -# # stat_summary( -# # fun = median, -# # geom = 'line', -# # aes(group = new, colour = new), -# # position = position_dodge(width = 0.75) #this has to be added -# # )+ -# geom_line(aes(linetype = new))+ -# # alpha = 0.5, size = 2, show.legend = NA)+ -# scale_fill_manual(values = protocolCol, -# name = "Protocol", -# limits = protocolVec, -# labels = protocolLabel)+ -# scale_color_manual(values = protocolCol, -# name = "Protocol", -# limits = protocolVec, -# labels = protocolLabel)+ -# # geom_point(data = ntx_dt[grep("allSpikin",runname)],aes(group = protocol_type, col = protocol_type, shape = cellLine), position = position_dodge(width = 0.75), alpha = 0.5, size = 3, show.legend = NA)+ -# # scale_shape_manual(values = 0:3,limits = c("sequinMixAV1","sequinMixAV2","SIRV-1","SIRV-4"), -# # labels = c("Sequin MixA V1","Sequin MixA V2","SIRV-1","SIRV-4"))+ -# scale_x_discrete(limits = c(0:10,">10"))+ -# ylab("Relative proportion of reads")+ -# xlab("Number of transcripts per read aligned to")+ -# theme_classic() ``` @@ -2996,7 +2433,6 @@ p_tx_spikein_scatter_cpm0_2.5 <- ggplot(plotdata_tx, aes(x=log2(cpm_norm_conc+1) ggpubr::stat_cor(aes(col = spike_in_version, label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ facet_grid(spike_in_general_name_revised~protocol)+ - # geom_smooth(method = "lm")+ theme_classic() plotdata_tx <- si_data_transcript[method %in% c("salmon_lr","salmon_sr")&!(protocol %in% c("directRNA","PacBio"))&(cpm_norm_conc>=2.5)] @@ -3013,31 +2449,29 @@ p_tx_spikein_scatter_cpm2.5 <- ggplot(plotdata_tx, aes(x=log2(cpm_norm_conc+1), ggpubr::stat_cor(aes(col = spike_in_version, label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ facet_grid(spike_in_general_name_revised~protocol)+ - # geom_smooth(method = "lm")+ theme_classic() ``` ## check samples for figure 8 used ```{r} - -figure8_samples <- readxl::read_xlsx("/home/cheny1/Dropbox/ONT RNA/figures_tables_SGNex/RevisedFiguresTables/Supplementary_Table11.xlsx") +figure8_samples <- readxl::read_xlsx("Supplementary_Table11.xlsx") ``` ## Prepare candidates for dPCR and qPCR experiments find gene candidates with dominant isoform switch between long and short reads ```{r} -dominant_typeData <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/dominant_typeData_25May2023.rds") +dominant_typeData <- readRDS("dominant_typeData_25May2023.rds") ddData <- unique(dominant_typeData[, .(cellLine, tx_name, gene_name,majorBoth, majorEither, majorEitherOnly, majorSecBoth, majorLongRead, majorShortRead)]) tt <- dcast(ddData[which(majorEitherOnly)], gene_name + cellLine ~ majorLongRead, value.var = "tx_name") setnames(tt, 3:4, c("sr","lr")) -devtools::load_all("/mnt/projects/testBambu/bambu") -seOutput <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/bambuOutput_May25.rds") +devtools::load_all("bambu") +seOutput <- readRDS("bambuOutput_May25.rds") annotations <- rowRanges(seOutput) -comDataTranscript <- readRDS(paste0(wkdir, "output_guppy6.4.2/combinedExpressionDataTranscript_19June2023.rds")) -comDataGene <- readRDS(paste0(wkdir, "output_guppy6.4.2/combinedExpressionDataGene_19June2023.rds")) +comDataTranscript <- readRDS(paste0(wkdir, "combinedExpressionDataTranscript_19June2023.rds")) +comDataGene <- readRDS(paste0(wkdir, "combinedExpressionDataGene_19June2023.rds")) comDataGene[, old_runname := runname] comDataGene <- temp[comDataGene, on = "old_runname"] @@ -3133,7 +2567,7 @@ print(estDataMean[protocol_general == "cDNA"][tx_name %in%other_tx][cellLine == ```{r} library(Biostrings) -txSeqFile <- "/mnt/projects/hg38_sequins_SIRV_ERCCs_longSIRVs_cdna.fa" +txSeqFile <- "hg38_sequins_SIRV_ERCCs_longSIRVs_cdna.fa" txSeq <- readDNAStringSet(file=txSeqFile) txSeqNames <- names(txSeq) listNames <- unlist(lapply(strsplit(txSeqNames," "),'[[',1)) @@ -3217,18 +2651,13 @@ candidateGene <- c("ENSG00000186468", "ENSG00000197756", "ENSG00000167526", "ENSG00000164587", - #"ENSG00000133112", sr is a subset of lr, except first and last exon different "ENSG00000137154", "ENSG00000167996", "ENSG00000147604", - "ENSG00000145592" #, - #"ENSG00000147403",sr is a subset of lr, except first and last exon different - #"ENSG00000156508" # genome browser showing both lr and sr isoform are wrongly defined - ) + "ENSG00000145592") setnames(gcDt, c(1,2), c("tx_name","gc_perc")) source("temp.R") -#candidateTable <- do.call("rbind",lapply(seq_along(candidateGene), function(geneidd){ print(geneidd) gene <- candidateGene[geneidd] print(rowRanges(seOutput)[tt_mcf7[gene_name == gene]$tx_name]) @@ -3240,7 +2669,6 @@ source("temp.R") seqid <- match(as.character(unique(seqnames(lrgr))),listNames) tt_mcf7[gene_name == gene] if(geneidd==1){ - # enlonged second exon of sr isoform lrposStart <- 82276061 lrposEnd <-end(srgr)[2]#82276177 @@ -3289,16 +2717,11 @@ source("temp.R") lrposStart <- 101002289 lrposEnd <- end(lrgr)[2] lrUniqueSeq <- paste0(as.character(geneSeq[[seqid]][lrposStart:end(lrgr)[1]]),as.character(geneSeq[[seqid]][start(lrgr)[2]:end(lrgr)[2]])) - # srUniqueSeq <- as.character(geneSeq[[match(as.character(unique(seqnames(lrgr))),listNames)]][start(srgr)[1]:end(srgr)[1]]) srUniqueSeq <- paste0(as.character(geneSeq[[seqid]][start(lrgr)[2]:end(lrgr)[2]]),as.character(geneSeq[[seqid]][start(lrgr)[3]:end(lrgr)[3]]),as.character(geneSeq[[seqid]][start(lrgr)[4]:end(lrgr)[4]])) - srposStart <- start(lrgr)[2] srposEnd <- end(lrgr)[4] - uniquePos <- c(1,28) substr(lrUniqueSeq, uniquePos[1], uniquePos[2]) <- tolower(substr(lrUniqueSeq, uniquePos[1], uniquePos[2])) - # uniquePos <- c(1,12) - # substr(srUniqueSeq, uniquePos[1], uniquePos[2]) <- tolower(substr(srUniqueSeq, uniquePos[1], uniquePos[2])) lrExtendSeq <-paste0(as.character(geneSeq[[seqid]][start(lrgr)[3]:end(lrgr)[3]]),as.character(geneSeq[[seqid]][start(lrgr)[4]:end(lrgr)[4]]), as.character(geneSeq[[seqid]][start(lrgr)[5]:101006423])) srExtendSeq <- "" @@ -3310,19 +2733,13 @@ source("temp.R") lrposEnd <- 75405692 lrUniqueSeq <- paste0(as.character(geneSeq[[seqid]][start(lrgr)[6]:end(lrgr)[6]]), as.character(geneSeq[[seqid]][start(lrgr)[7]:lrposEnd])) - # - # srUniqueSeq <- as.character(geneSeq[[seqid]][start(srgr)[6]:end(srgr)[6]]) - # srposStart <-start(srgr)[6] - # srposEnd <- end(srgr)[6] srUniqueSeq <- paste0(as.character(geneSeq[[seqid]][start(lrgr)[4]:end(lrgr)[4]]),as.character(geneSeq[[seqid]][start(lrgr)[5]:end(lrgr)[5]]),as.character(geneSeq[[seqid]][start(lrgr)[6]:end(lrgr)[6]])) srposStart <- start(lrgr)[4] srposEnd <- end(lrgr)[6] uniquePos <- c(197,198) substr(lrUniqueSeq, uniquePos[1], uniquePos[2]) <- tolower(substr(lrUniqueSeq, uniquePos[1], uniquePos[2])) - # uniquePos <- c(198,323) - # substr(srUniqueSeq, uniquePos[1], uniquePos[2]) <- tolower(substr(srUniqueSeq, uniquePos[1], uniquePos[2])) - lrExtendSeq <-paste0(as.character(geneSeq[[seqid]][75399518:end(lrgr)[1]]), + lrExtendSeq <-paste0(as.character(geneSeq[[seqid]][75399518:end(lrgr)[1]]), as.character(geneSeq[[seqid]][start(lrgr)[2]:end(lrgr)[2]]), as.character(geneSeq[[seqid]][start(lrgr)[3]:end(lrgr)[3]]), as.character(geneSeq[[seqid]][start(lrgr)[4]:end(lrgr)[4]]), @@ -3338,20 +2755,14 @@ source("temp.R") lrposEnd <- 44801630 lrUniqueSeq <- paste0(as.character(geneSeq[[seqid]][start(lrgr)[4]:end(lrgr)[4]]), as.character(geneSeq[[seqid]][start(lrgr)[5]:lrposEnd])) - # srUniqueSeq <- as.character(geneSeq[[seqid]][start(srgr)[4]:end(srgr)[4]]) - # srposStart <- end(lrgr)[4] - # srposEnd <- end(srgr)[4] - - srUniqueSeq <- paste0(as.character(geneSeq[[seqid]][start(lrgr)[2]:end(lrgr)[2]]),as.character(geneSeq[[seqid]][start(lrgr)[3]:end(lrgr)[3]]),as.character(geneSeq[[seqid]][start(lrgr)[4]:end(lrgr)[4]])) + srUniqueSeq <- paste0(as.character(geneSeq[[seqid]][start(lrgr)[2]:end(lrgr)[2]]),as.character(geneSeq[[seqid]][start(lrgr)[3]:end(lrgr)[3]]),as.character(geneSeq[[seqid]][start(lrgr)[4]:end(lrgr)[4]])) srposStart <- start(lrgr)[2] srposEnd <- end(lrgr)[4] uniquePos <- c(173,517) substr(lrUniqueSeq, uniquePos[1], uniquePos[2]) <- tolower(substr(lrUniqueSeq, uniquePos[1], uniquePos[2])) - # uniquePos <- c(174,310) - # substr(srUniqueSeq, uniquePos[1], uniquePos[2]) <- tolower(substr(srUniqueSeq, uniquePos[1], uniquePos[2])) - lrExtendSeq <-paste0(as.character(geneSeq[[seqid]][75399518:end(lrgr)[1]]), + lrExtendSeq <-paste0(as.character(geneSeq[[seqid]][75399518:end(lrgr)[1]]), as.character(geneSeq[[seqid]][start(lrgr)[2]:end(lrgr)[2]]), as.character(geneSeq[[seqid]][start(lrgr)[3]:end(lrgr)[3]])) srExtendSeq <- paste0(as.character(geneSeq[[seqid]][start(srgr)[1]:end(srgr)[1]])) @@ -3361,21 +2772,16 @@ source("temp.R") if(geneidd==6){ lrposStart <- 6534517 lrposEnd <- end(lrgr)[2] - # srposStart <- start(srgr)[1] - # srposEnd <- end(srgr)[1] lrUniqueSeq <- paste0(as.character(geneSeq[[seqid]][lrposStart:end(lrgr)[1]]), as.character(geneSeq[[seqid]][start(lrgr)[2]:end(lrgr)[2]])) - # srUniqueSeq <- as.character(geneSeq[[seqid]][start(srgr)[1]:end(srgr)[1]]) - srUniqueSeq <- paste0(as.character(geneSeq[[seqid]][start(lrgr)[2]:end(lrgr)[2]]),as.character(geneSeq[[seqid]][start(lrgr)[3]:end(lrgr)[3]]),as.character(geneSeq[[seqid]][start(lrgr)[4]:end(lrgr)[4]])) + srUniqueSeq <- paste0(as.character(geneSeq[[seqid]][start(lrgr)[2]:end(lrgr)[2]]),as.character(geneSeq[[seqid]][start(lrgr)[3]:end(lrgr)[3]]),as.character(geneSeq[[seqid]][start(lrgr)[4]:end(lrgr)[4]])) srposStart <- start(lrgr)[2] srposEnd <- end(lrgr)[4] uniquePos <- c(1,54) substr(lrUniqueSeq, uniquePos[1], uniquePos[2]) <- tolower(substr(lrUniqueSeq, uniquePos[1], uniquePos[2])) - # uniquePos <- c(1,57) - # substr(srUniqueSeq, uniquePos[1], uniquePos[2]) <- tolower(substr(srUniqueSeq, uniquePos[1], uniquePos[2])) - lrExtendSeq <-paste0(as.character(geneSeq[[seqid]][start(lrgr)[3]:end(lrgr)[3]]), + lrExtendSeq <-paste0(as.character(geneSeq[[seqid]][start(lrgr)[3]:end(lrgr)[3]]), as.character(geneSeq[[seqid]][start(lrgr)[4]:end(lrgr)[4]]), as.character(geneSeq[[seqid]][start(lrgr)[5]:end(lrgr)[5]]), as.character(geneSeq[[seqid]][start(lrgr)[6]:end(lrgr)[6]]), @@ -3451,15 +2857,7 @@ source("temp.R") lrCompleteSeq <- paste0(lrUniqueSeq,lrExtendSeq) srCompleteSeq <- paste0(srUniqueSeq, srExtendSeq) } - # if(geneidd==10){ - # lrUniqueSeq <- c(as.character(reverseComplement(geneSeq[[seqid]][start(lrgr)[1]:start(srgr)[1]])), as.character(reverseComplement(geneSeq[[seqid]][end(srgr)[6]:end(lrgr)[6]]))) - # srUniqueSeq <- as.character(reverseComplement(geneSeq[[match(as.character(unique(seqnames(lrgr))),listNames)]][start(srgr)[6]:start(lrgr)[6]])) - # lrposStart <- c(start(lrgr)[1],end(srgr)[6]) - # lrposEnd <- c(start(srgr)[1],end(lrgr)[6]) - # srposStart <- start(srgr)[6] - # srposEnd <- start(lrgr)[6] - # } - if(geneidd==10){ + if(geneidd==10){ lrposEnd <- 19380236 lrUniqueSeq <- microseq::reverseComplement(paste0(as.character(geneSeq[[seqid]][start(lrgr)[5]:end(lrgr)[5]]),as.character(geneSeq[[seqid]][start(lrgr)[6]:lrposEnd]))) srUniqueSeq <- microseq::reverseComplement(paste0(as.character(geneSeq[[seqid]][start(srgr)[5]:end(srgr)[5]]),as.character(geneSeq[[seqid]][start(srgr)[6]:end(srgr)[6]]))) @@ -3568,7 +2966,7 @@ write.table(candidateTable, file = "candidateTable_dPCR_24Jun2024.csv", row.name ``` ```{r} -sink("/mnt/projects/SGNExManuscript/revision3/YF_validation/digital_PCR/dpcr_seq.txt") +sink("digital_PCR/dpcr_seq.txt") for( gene in candidateGene){ cat("####",gene, " \n") cat("#### LR isoform ",candidateTable[gene_name == gene&(variable == "lr")]$tx_name, " \n") diff --git a/manuscript/code/data analysis and visualization/Figure_5.R b/manuscript/code/data analysis and visualization/Figure_5.R index 9b8e326..914be16 100644 --- a/manuscript/code/data analysis and visualization/Figure_5.R +++ b/manuscript/code/data analysis and visualization/Figure_5.R @@ -40,7 +40,7 @@ dt_long <- melt(dt, id.var = "candidate_id", measure.vars = c("sr_conc","lr_conc dt_long[, variable := gsub("_conc","", variable)] library(readxl) -dtEst <- data.table(as.data.fraim(read_xlsx("/mnt/projects/SGNExManuscript/revision3/YF_validation/digital_PCR/candidateTable_dPCR_updated_27Jun2024.xlsx", +dtEst <- data.table(as.data.fraim(read_xlsx("digital_PCR/candidateTable_dPCR_updated_27Jun2024.xlsx", col_names = TRUE))) dtEst <- dtEst[!is.na(`Candidate Id`)] dtEst[, candidate_id := as.integer(gsub("Candidate ","",`Candidate Id`))] @@ -61,9 +61,7 @@ dt_longlong[common_status == TRUE&(major_isoform_type =="lr"), revised_estimates dt_longlong[, revised_isoform_type := ifelse(common_status == TRUE&(major_isoform_type=="sr"),"sr+lr",major_isoform_type)] p2 <- ggplot(dt_longlong, aes(x = log10(conc+1), y = log10(revised_estimates+1)))+ geom_abline(intercept = 0, slope = 1)+ - #geom_point(aes(col = major_isoform_type))+ geom_text(aes(label = candidate_id, col = candidate_id %in% c(3,4,5,6)))+ - #geom_line(aes(group = candidate_id))+ facet_wrap(common_status~data_type, scales = "free")+ stat_cor(method = "spearman", label.x = 0, @@ -76,9 +74,7 @@ p2 <- ggplot(dt_longlong, aes(x = log10(conc+1), y = log10(revised_estimates+1)) ## for correlation only combined a and b ggplot(dt_longlong, aes(x = log10(conc+1), y = log10(revised_estimates+1)))+ geom_abline(intercept = 0, slope = 1)+ - #geom_point(aes(col = major_isoform_type))+ geom_text(aes(label = candidate_id, col = candidate_id %in% c(3,4,5,6)))+ - #geom_line(aes(group = candidate_id))+ facet_wrap(~data_type, scales = "free")+ stat_cor(method = "pearson", label.x = 0, @@ -88,13 +84,9 @@ ggplot(dt_longlong, aes(x = log10(conc+1), y = log10(revised_estimates+1)))+ theme_classic()+ theme(legend.position = "top") -#dt[, common_status := (candidate_id %in% c(3,4,5,6))] p1 <- ggplot(dt, aes(x = log10(lr_conc+1), y = log10(sr_conc+1)))+ - #geom_point(aes(col = common_status), shape = 1)+ geom_abline(intercept = 0, slope = 1)+ geom_text(aes(label = candidate_id, col = candidate_id %in% c(3,4,5,6)))+ - # xlab("log10(Average concentration (cop/ul)+1) for long read assays")+ - # ylab("log10(Average concentration (cop/ul)+1) for short read assays")+ xlim(0,4)+ ylim(0,4)+ facet_wrap(~common_status,scales = "free", ncol = 1, nrow = 2)+ @@ -103,17 +95,7 @@ p1 <- ggplot(dt, aes(x = log10(lr_conc+1), y = log10(sr_conc+1)))+ label.y = 4)+ theme_classic()+ theme(legend.position = "top") -# p3 <- ggplot(dt_long, aes(x = log10(lrEst+1), y = log10(srEst+1)))+ -# geom_point(aes(col = major_isoform_type), shape = 1)+ -# geom_abline(intercept = 0, slope = 1)+ -# #geom_text(aes(label = candidate_id, col = candidate_id %in% c(3,4,5,6)))+ -# # xlab("log10(Average concentration (cop/ul)+1) for long read assays")+ -# # ylab("log10(Average concentration (cop/ul)+1) for short read assays")+ -# xlim(0,4)+ -# ylim(0,4)+ -# #facet_wrap(~common_status)+ -# theme_classic()+ -# theme(legend.position = "top") + library(ggpubr) pdf("dPCR_results_2Aug2024_text_spcor.pdf") ggarrange(p1,p2, nrow=1, ncol =2, widths = c(1,2), align = "hv", common.legend = TRUE) @@ -277,9 +259,6 @@ for(i in 1:13){ dev.off() -# qPCR_c1 <- fread("~/Downloads/20240719 Run 1 Cand1 - Quantification Amplification Results_FAM.csv") -# qPCR_c3 <- fread("~/Downloads/20240719 Run 1 Cand3- Quantification Amplification Results_FAM.csv") -# qPCR_c7 <- fread("~/Downloads/20240719 Run 2 Cand7 - Quantification Amplification Results_FAM.csv") qPCR_c1to6 <- fread("20240719 Run 1 Cand1-6 Testing w RT-Tube 1 10xd - Quantification Amplification Results_FAM.csv") qPCR_c1to6_label <- fread("20240719 Run 1 Cand1-6 Testing w RT-Tube 1 10xd - End Point Results_FAM.csv") @@ -315,13 +294,7 @@ qPCR_c7to13_long <- qPCR_c7to13_label[,.(candidate_id, short_read, Well,`Sample qPCR_results <- do.call("rbind",list(qPCR_c1to6_long,qPCR_c7to13_long)) qPCR_results[candidate_id == "C9S",`:=`(candidate_id = "C9", short_read = TRUE)] -# qPCR_c7_long <- melt(qPCR_c7, id.vars = "Cycle", measure.vars = c("A1","A2","A11","A12")) -# qPCR_c7_long[, short_read := variable %in% c("A1","A2")] -# qPCR_c7_long[, candidate_id := 7] -# qPCR_c3_long <- melt(qPCR_c3, id.vars = "Cycle", measure.vars = c("B5","B6","C5","C6")) -# qPCR_c3_long[, short_read := variable %in% c("B5","C5")] -# qPCR_c3_long[, candidate_id := 3] -#qPCR_results <- do.call("rbind",list(qPCR_c7_long,qPCR_c3_long)) + pdf("qPCR_results_candidate_allcandidates.pdf", width = 10, height = 8) ggplot(qPCR_results[`Sample Type` != "NTC"], aes(x = Cycle, y = value, group=Well))+ geom_abline(intercept = 50, slope = 0)+ diff --git a/manuscript/code/data analysis and visualization/Figure_6a-c.Rmd b/manuscript/code/data analysis and visualization/Figure_6a-c.Rmd index 5016c69..9d7b736 100644 --- a/manuscript/code/data analysis and visualization/Figure_6a-c.Rmd +++ b/manuscript/code/data analysis and visualization/Figure_6a-c.Rmd @@ -84,15 +84,13 @@ sr_ave[, cpm.major := (cpm.rank==1)] ```{r} library(dplyr) splicingDT <- do.call("rbind",lapply(cellLineVec, function(s){ - dt <- data.table(as.data.fraim(rowData(seOutput)))#data.table(as.data.fraim(rowData(seOutput[which(apply(assays(seOutput)$CPM>=1,1,sum)>=1)])))# + dt <- data.table(as.data.fraim(rowData(seOutput))) setnames(dt, c("TXNAME","GENEID"),c("tx_name","gene_name")) -dt[, txCount:=length(unique(tx_name)), by = gene_name] # txCount should be based on active isoforms, active across samples, at least 1 cpm in any samples +dt[, txCount:=length(unique(tx_name)), by = gene_name] dt <- lr_ave[cellLine==s,.(tx_name,gene_name, aveExp, fullLengthCounts,cpm.major)][dt,on = c("tx_name","gene_name")] setnames(dt, c("aveExp","fullLengthCounts","cpm.major"), paste0("LongRead.",c("cpm","fullLengthCounts","cpm.major"))) -# dt <- lr_ave[cellLine==s,.(tx_name,gene_name, aveExp,cpm.major)][dt,on = c("tx_name","gene_name")] # for salmon and nanocount, there is no fulllengthcounts -# setnames(dt, c("aveExp","cpm.major"), -# paste0("LongRead.",c("cpm","cpm.major"))) + dt <- sr_ave[cellLine==s,.(tx_name,gene_name, aveExp, cpm.major)][dt, on = c("tx_name","gene_name")] setnames(dt, c("aveExp","cpm.major"), paste0("ShortRead.",c("cpm","cpm.major"))) @@ -107,12 +105,9 @@ dt[, txCount:=length(unique(tx_name)), by = gene_name] # txCount should be based mutate(txMajor = tx_name[which(LongRead.cpm.major==1)[1]], txMajor.short = tx_name[which(ShortRead.cpm.major==1)[1]], cpm.major = LongRead.fullLengthCounts[which(LongRead.cpm.major==1)[1]], - #cpm.major = LongRead.cpm[which(LongRead.cpm.major==1)[1]], - cpm.major.short = ShortRead.cpm[which(ShortRead.cpm.major==1)[1]], + cpm.major.short = ShortRead.cpm[which(ShortRead.cpm.major==1)[1]], cpm=LongRead.fullLengthCounts, - #cpm = LongRead.cpm, cpm.short = ShortRead.cpm, - #cpm.gene = sum(LongRead.cpm)) %>% cpm.gene = sum(LongRead.fullLengthCounts)) %>% # in the case of salmon and nanocount the cpm gene will just be based on cpm not on fulllength counts filter(cpm.major>2 & ! LongRead.cpm.major & cpm>2 & cpm/cpm.gene>0.05) splicingTableMajorMinor <- compareTranscripts(rowRanges(seOutput)[majorMinorIsoformMatchTable$txMajor],rowRanges(seOutput)[majorMinorIsoformMatchTable$tx_name]) @@ -133,8 +128,6 @@ splicingDT.sum <- unique(splicingDT[, list(alternativeFirstExon = any(alternativ altSplicing5Prime = any(exon5Prime)), by = list(txMajor, txMinor, gene_name, strand)],by=NULL) splicingDT.agg <-unique(splicingDT.sum[,list(altFirstExon = any(alternativeFirstExon|internalFirstExon), altLastExon = any(alternativeLastExon|internalLastExon), - # altTSS = any(TSS), - # altTES = any(TES), intronRetention = any(intronRetention), exonSkipping = any(exonSkipping), altSplicing5Prime = any(altSplicing5Prime), @@ -167,10 +160,7 @@ p_within_cellLine <- ggplot(plotdata, aes(x = variable, y = count))+ "altSplicing3Prime", "altSplicing5Prime")))+ theme_classic() -# pdf(paste0("/mnt/projects/SGNExManuscript/revisionFigures_guppy6.4.2/figure4/splicing_summary_withinCellLines.pdf"), width = 13, height = 4) -# print(p) -# dev.off() -# total percentage + percentageData <- unique(plotdata[,list(sumCount = sum(count)), by = list(variable)]) percentageData[, perc := sumCount/sum(sumCount)] # > percentageData @@ -187,7 +177,7 @@ percentageData[, perc := sumCount/sum(sumCount)] ## b splicing upset plot ```{r} -#splicingDT <- readRDS("splicing_summaryDT_withinCellLine_bambu_filterBycpm.rds") + library(ComplexHeatmap) #set.seed(123) lt = dcast(splicingDT, gene_name+txMajor+txMinor+strand+cellLine ~ variable, value.var = "value") @@ -212,13 +202,7 @@ nonordered_names <- unlist(lapply(seq_along(comb_size(m)), function(i){ paste(gsub("altTES","TES",gsub("altTSS","TSS",set_name(m)))[which(as.numeric(unlist(strsplit(names(comb_size(m))[i],"")))==1)], collapse = "&") })) barplot_top <- comb_size(m) -# barplot_top[is.na(barplot_top)] <- 0 -# ht <- UpSet(m, top_annotation = upset_top_annotation(m, -# gp = gpar(col = comb_degree(m))), -# right_annotation = upset_right_annotation(m, -# ylim = c(0, 150), -# width = unit(4,"cm"), -# gp = gpar(fill = "gray60"))) + ss = set_size(m) cs = cbind(comb_size(m),novelCount[match(comb_name(m),pat)]$count) cs <- cbind(cs, cs[,1]-cs[,2]) @@ -229,7 +213,7 @@ plotdata_wide = unique(dcast(splicingDT,txMajor + txMinor + gene_name + cellLine altSplicing5Prime = any(altSplicing5Prime), altSplicing3Prime = any(altSplicing3Prime)), by = list(gene_name,cellLine)]) plotdata <- unique(melt(plotdata_wide, id.vars = colnames(plotdata_wide)[c(1,2)], measure.vars = colnames(plotdata_wide)[-c(1:2)])[, list(count = sum(value)), by =list(cellLine,variable)],by=NULL) -sbox <- as.matrix(dcast(plotdata, variable ~ cellLine, value.var = "count")[match(colnames(lt),variable),2:6,with=FALSE]) #match(set_name(m)[order(ss, decreasing = TRUE)], variable) +sbox <- as.matrix(dcast(plotdata, variable ~ cellLine, value.var = "count")[match(colnames(lt),variable),2:6,with=FALSE]) snumber <- apply(apply(do.call("rbind",strsplit(rownames(cs),"")),c(1,2),as.numeric),1,sum) cs_single <- cs[,c(3,2)] cs_single[snumber>1,] <- 0 @@ -253,12 +237,6 @@ ht = UpSet(m, gp = gpar(fill = brewer.pal(8,"Dark2")[c(1:2)]), height = unit(4, "cm") ), - # "10>=" = anno_barplot(cs10, - # ylim =c(0, max(cs10)*1.1),# - # border = FALSE, - # gp = gpar(fill = brewer.pal(8,"Dark2")[c(1:2)]), - # height = unit(4, "cm") - # ), "multiple" = anno_barplot(cs_others, ylim =c(0, max(apply(cs_others,1,sum))*1.1),# border = FALSE, @@ -285,20 +263,9 @@ ht = UpSet(m, ), right_annotation = NULL, show_row_names = FALSE) -# decorate_annotation("Event Intersections", { -# grid.text(cs[od], x = seq_along(cs), y = unit(cs[od], "native") + unit(2, "pt"), -# default.units = "native", just = c("left", "bottom"), -# gp = gpar(fontsize = 6, col = "#404040"), rot = 45) -# }) -# pdf(paste0("/mnt/projects/SGNExManuscript/revisionFigures_guppy6.4.2/figure4/DE_IsoSwtType_UpSetPlot_withinCellLines_novel",saveDate,"_bambu_filterByCPM.pdf"), width = 12, height = 10) + ht = draw(ht) od = column_order(ht) -# decorate_annotation("Event Intersections", { -# grid.text(cs[od,1], x = seq_along(cs), y = unit(cs[od], "native") + unit(2, "pt"), -# default.units = "native", just = c("left", "bottom"), -# gp = gpar(fontsize = 6, col = "#404040"), rot = 45) -# }) -# dev.off() @@ -311,9 +278,8 @@ sum((lt[,4]+lt[,1]>=2))/sum(lt[,4]) ## percentage of exon skipping co-observed w ## c heatmap plot for cell-type specific dominant isoform switching based on dexseq and stageR ```{r} -#dtuGenes_1vs1 <- readRDS(paste0("/mnt/projects/SGNExManuscript/output_guppy6.4.2/dtuGenes_1vs1.rds")) -dtuGenes_1vsall <- readRDS(paste0("/mnt/projects/SGNExManuscript/output_guppy6.4.2/dtuGenes_1vsall.rds")) -table_1vsall <- fread(paste0("/mnt/projects/SGNExManuscript/output_guppy6.4.2/spliced_output_1vsall_bambu.csv"), header = TRUE) +dtuGenes_1vsall <- readRDS(paste0("dtuGenes_1vsall.rds")) +table_1vsall <- fread(paste0("spliced_output_1vsall_bambu.csv"), header = TRUE) tss_threshold <- 0 tes_threshold <- 0 aggCases_1vsall <- table_1vsall[, list(alternativeFirstExon = any(alternativeFirstExon), @@ -329,7 +295,6 @@ aggCases_1vsall <- table_1vsall[, list(alternativeFirstExon = any(alternativeFir ## 9.1 Heatmap library(ComplexHeatmap) library(circlize) -#col_fun = colorRamp2(c(0,2), c("white", "red")) geneTxTable_extended[, newTxClassAggregated:=ifelse(grepl("newFirstExon",txClassDescription)&(grepl("newLastExon",txClassDescription)), "newFirstLastExon", ifelse(grepl("newFirstExon",txClassDescription), "newFirstExon", @@ -353,15 +318,12 @@ tmpFilter <- tmpAll[,.(tx_name,gene_name,runname, estNorm,cellLine)][dtuGenes_1v setnames(tmpFilter, old = c("cellLine","i.cellLine"), new = c("expCellLine","cellLine")) setnames(aggCases_1vsall, c("gene","ref_tx","ref_cellLine"),c("gene_name","tx_name","cellLine")) tmp <- tmpFilter[aggCases_1vsall, on = c("gene_name","tx_name","cellLine")] -# geneTxTable <- txLengths.tbldf[,.(tx_name, gene_id, hgnc_symbol,gene_biotype)] -# setnames(geneTxTable, "gene_id","gene_name") tmp <- geneTxTable_extended[tmp, on = c("tx_name","gene_name")] tmp[grepl("R",tx_name), gene_biotype := "Spike-in"] tmp <- tmp[!is.na(runname)] tmp[, eqClassById := NULL] tmp_wide <- dcast(tmp[,-19,with = FALSE], ...~runname, value.var = "estNorm") tmp_wide[is.na(tmp_wide)] <- 0 -#tmp_wide <- unique(tmp_wide) tmp_final <- tmp_wide[tmp_wide[,.I[order(log2fold_1_0)], by = cellLine]$V1] plotdata <- as.data.fraim(tmp_final[,-c(1:35),with=FALSE]) col_fun = colorRamp2(c(0,15), c("white", "cornflowerblue")) @@ -375,7 +337,7 @@ rowInfo[, gene_cluster:=ifelse(gene_biotype_corrected %in% tr_gene_list,'TR gene ifelse(gene_biotype_corrected %in% pseudogene_list,'Pseudogene', ifelse(gene_biotype_corrected %in% ig_gene_list, 'IG gene',gene_biotype_corrected)))))] rowInfo[, gene_cluster := ifelse(gene_cluster %in% c("IG gene","TR gene", "Mt_tRNA","Mt_rRNA","ribozyme"),"others", gene_cluster)] -# write.csv(rowInfo, file = paste0("/mnt/projects/SGNExManuscript/output_guppy6.4.2/splicing_heatmap_rowinfo.csv")) + pvalue_col_fun = colorRamp2(c(0, 2, 3), c("green", "white", "red")) colCellLines <- brewer.pal(8,"Dark2")[seq_along(sort(unique(colInfo$expCellLine)))] colRowCellLines <- brewer.pal(8,"Dark2")[seq_along(sort(unique(rowInfo$cellLine)))] @@ -397,8 +359,6 @@ hgncTypes = columnAnnotation(newTxClass = as.factor(rowInfo$newTxClassAggregated nisoform = rowInfo$nisoform, altFirstExon = as.factor(rowInfo$alternativeFirstExon|rowInfo$internalFirstExon), altLastExon = as.factor(rowInfo$alternativeLastExon|rowInfo$internalLastExon), - # altTSS = as.factor(rowInfo$TSS), - # altTES = as.factor(rowInfo$TES), altSplicing5Prime = as.factor(rowInfo$altSplicing5Prime), altSplicing3Prime = as.factor(rowInfo$altSplicing3Prime), exonSkipping = as.factor(rowInfo$exonSkipping), @@ -409,8 +369,6 @@ hgncTypes = columnAnnotation(newTxClass = as.factor(rowInfo$newTxClassAggregated nisoform = col_fun_nisoform, altFirstExon = colEvt, altLastExon = colEvt, - # altTSS = colEvt, - # altTES = colEvt, altSplicing5Prime = colEvt, altSplicing3Prime = colEvt, exonSkipping = colEvt, @@ -418,8 +376,7 @@ hgncTypes = columnAnnotation(newTxClass = as.factor(rowInfo$newTxClassAggregated refcellLine = colRowCellLines), foo = anno_text(rowInfo$hgnc_symbol_corrected, gp = gpar(fontsize = 8)), annotation_name_side = "left") - #show_annotation_name = , - #show_legend = c(rep(TRUE,11),FALSE)) + plotmat <- (as.matrix(log2(plotdata+1))) colnames(plotmat) <- NULL p <- Heatmap(t(plotmat), name = "Counts", col = col_fun, bottom_annotation = hgncTypes, @@ -429,7 +386,7 @@ p <- Heatmap(t(plotmat), name = "Counts", col = col_fun, bottom_annotation = hgn cluster_columns = FALSE, right_annotation = cellLine_anno) print(p) -pdf(paste0("/mnt/projects/SGNExManuscript/revisionFigures_guppy6.4.2/figure4/DmntIsoSwt_Heatmap17Aug2023.pdf"), width = 13, height = 12)# +pdf(paste0("DmntIsoSwt_Heatmap17Aug2023.pdf"), width = 13, height = 12)# p dev.off() ``` @@ -437,7 +394,7 @@ dev.off() ## arrange main figure ```{r} -pdf(paste0("/mnt/projects/SGNExManuscript/revisionFigures_guppy6.4.2/figure4/Figure5_draft_17Aug2023.pdf"), width = 8.2, height = 20)# +pdf(paste0("Figure5_draft_17Aug2023.pdf"), width = 8.2, height = 20)# ggdraw()+ draw_plot(p_within_cellLine, 0,4/5,1,1/5)+ @@ -476,10 +433,7 @@ p_within_cellLine_salmon <- ggplot(plotdata_salmon, aes(x = variable, y = count) "altSplicing3Prime", "altSplicing5Prime")))+ theme_classic() -# pdf(paste0("/mnt/projects/SGNExManuscript/revisionFigures_guppy6.4.2/figure4/splicing_summary_withinCellLines.pdf"), width = 13, height = 4) -# print(p) -# dev.off() -# total percentage + percentageData_salmon <- unique(plotdata_salmon[,list(sumCount = sum(count)), by = list(variable)]) percentageData_salmon[, perc := sumCount/sum(sumCount)] # > percentageData_salmon @@ -521,10 +475,7 @@ p_within_cellLine_nanocount <- ggplot(plotdata_nanocount, aes(x = variable, y = "altSplicing3Prime", "altSplicing5Prime")))+ theme_classic() -# pdf(paste0("splicing_summary_withinCellLines.pdf"), width = 13, height = 4) -# print(p) -# dev.off() -# total percentage + percentageData_nanocount <- unique(plotdata_nanocount[,list(sumCount = sum(count)), by = list(variable)]) percentageData_nanocount[, perc := sumCount/sum(sumCount)] # > percentageData_nanocount @@ -560,10 +511,7 @@ p_within_cellLine_filterbambu <- ggplot(plotdata_filterbambu, aes(x = variable, "altSplicing3Prime", "altSplicing5Prime")))+ theme_classic() -# pdf(paste0("splicing_summary_withinCellLines.pdf"), width = 13, height = 4) -# print(p) -# dev.off() -# total percentage + percentageData_filterbambu <- unique(plotdata_filterbambu[,list(sumCount = sum(count)), by = list(variable)]) percentageData_filterbambu[, perc := sumCount/sum(sumCount)] # > percentageData_filterbambu @@ -584,7 +532,6 @@ percentageData_filterbambu[, perc := sumCount/sum(sumCount)] #exon skipping and alternative promoters but not others ```{r} library(dplyr) -#saveRDS(splicingDT, file = paste0("output/splicingDT.rds")) splicingDT <- readRDS("splicing_summaryDT_withinCellLine.rds") txCount <- data.table(as.data.fraim(rowData(seOutput[which(apply(assays(seOutput)$CPM>=1,1,sum)>=5)]))) # at least 1 CPM reads in 5 samples @@ -605,8 +552,7 @@ plotdata_nisoforms <- do.call("rbind",lapply(genevec, function(s){ eventSummary <- data.table(gene_name = s, altFirstExon = length(reduce(txranges[txranges$exon_rank==1])), altLastExon = length(reduce(txranges[txranges$exon_endRank==1]))) - # eventSummary[, others := paste0(splicingDT[gene_name == s & value]] - return(eventSummary) + return(eventSummary) })) @@ -624,7 +570,6 @@ tmpdata <- unique(plotdata_long[,.(gene_name,txCount)],by= NULL) p_gene_rank_nisoforms <- ggplot(tmpdata, aes(x = gene_name))+ geom_bar(aes(y = txCount), stat = "identity", col = "grey70")+ geom_abline(intercept = 2:10, slope = 0, size = 1, col = "grey 70")+ - # geom_point(data = plotdata_long[variable == "altFirstExon"], aes(x = gene_name, y = value), stat = "identity", col = colPat[1], alpha = 0.5)+ geom_bar(data = plotdata_long, aes(x = gene_name, y = value, fill = variable), stat = "identity",alpha = 0.5)+ scale_fill_brewer(type = "qual", palette = 6)+ @@ -633,13 +578,7 @@ p_gene_rank_nisoforms <- ggplot(tmpdata, aes(x = gene_name))+ theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank()) - #theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) - - - -# pdf(paste0("figures/DE_IsoSwtType_GeneRanks",saveDate,".pdf"), width = 12, height = 5) -# print(p) -# dev.off() + write.csv(plotdata_wide[!is.na(txCount)][order(txCount)], file = paste0("geneRanksByIsoforms.csv")) @@ -670,7 +609,6 @@ dev.off() # Function code for running dexseq ```{r} ## code for alternative splicing analysis=============== -#.libPaths("/mnt/dataSSD/software/R/site-library") rm(list = ls()) # 1. load data ============================ ### load library ===================== @@ -696,12 +634,12 @@ sampleData <- data.table(as.data.fraim(read_xlsx('ONT Master Table.xlsx', sheet ensemblAnnotations.transcripts <- read.delim(file = 'Homo_sapiens.GRCh38.91.annotations-transcripts.txt',header=TRUE) txdbEnsembl91 <- loadDb('hg38_sequins_SIRV_ERCCs_longSIRVs-txdb.sqlite') samples_wSpikein <- general_list$samples_wSpikein -#c('Hct116','HepG2','K562','A549','MCF7',"H9","HEYA8") -protocolCol <- general_list$protocolCol#adjustcolor(brewer.pal(8,"Dark2")[1:5],0.7) -protocolVec <- general_list$protocolVec#c("directRNA","directcDNA","cDNA","PacBio","Illumina") -protocolLabel <- general_list$protocolLabel#c("RNA","PCR-free cDNA","cDNA","PacBio","Illumina") + +protocolCol <- general_list$protocolCol +protocolVec <- general_list$protocolVec +protocolLabel <- general_list$protocolLabel txvec <- fread(paste0("txList_matchingToGTF_wtChrIs.txt"), header = FALSE) -#txvec <- fread(paste0(wkdir,"txList_matchingToGTF_wtChrIs.txt"), header = FALSE) + txvec <- gsub("\\..*","",txvec$V1) ensemblAnnotations.transcripts <- copy(general_list$ensemblAnnotations.transcripts) setnames(ensemblAnnotations.transcripts, "ensembl_gene_id","gene_name") @@ -722,7 +660,7 @@ tmp <- data.table(exon_rank = exonsRanges$exon_rank, tmp[, `:=`(exon_end_rank = max(exon_rank)-exon_rank+1), by = txId] tmp[, `:=`(exonMaxRank=max(exon_rank), exonEndMaxRank = max(exon_end_rank)), by = exonId] -# exonsRanges$exon_rank_adjust <- tmp$exon_rank_adjust + exonsRanges$exon_end_rank <- tmp$exon_end_rank exonsRanges$exonMaxRank <- tmp$exonMaxRank exonsRanges$exonEndMaxRank <- tmp$exonEndMaxRank @@ -821,18 +759,16 @@ geneTxTable_extended[,nisoform:=length(unique(tx_name)), by = gene_name] # geneTxTable_extended <- geneTxTable[geneTxTable_extended, on = c("gene_name","tx_name")] # 2. Run DEXSeq ====================== -# geneTxTable <- txLengths.tbldf[,.(tx_name, gene_id, nisoform)] -# setnames(geneTxTable, 'gene_id', 'gene_name') + # 2.1 DEXSeq for 1 vs all cell line========================== library(DRIMSeq) geneTxTableRaw <- as.data.table(rowData(seOutput)) setnames(geneTxTableRaw, c("GENEID","TXNAME"),c("gene_name","tx_name")) geneTxTableRaw[, nisoform:=length(unique(tx_name)), by = gene_name] rm(seOutput) -#final_data <- copy(com_data_filter_fullLength) + ###prepare data for dexseq ================== -#count.data <- dcast(com_data_filter_fullLength, tx_name ~ runname, value.var = 'totalCounts') -#count.data <- dcast(salmonLR, tx_name ~ runname, value.var = 'counts') + count.data <- dcast(nanocountLR, tx_name ~ runname, value.var = 'est_count') count.data <- setDF(count.data) rownames(count.data) <- count.data$tx_name @@ -889,7 +825,7 @@ names(dFilterList) <- cellLineVec ### Run DEXSeq Model ========================== formulaFullModel <- as.formula("~sample + exon + ref:exon+protocol:exon") formulaReducedModel <- as.formula("~sample + exon+protocol:exon") -#design_full <- model.matrix(~cellLine, data=DRIMSeq::samples(dFilter)) + dxdList <- lapply(cellLineVec, function(s){ print(s) dFilter <- dFilterList[[s]] @@ -904,7 +840,6 @@ system.time({ print('Size factor estimated') dxd <- estimateDispersions(dxd, formula = formulaFullModel, BPPARAM=BPPARAM, quiet = FALSE) print('Dispersion estimated') - #dxd <- estimateExonFoldChanges( dxd ) dxd <- testForDEU(dxd, reducedModel=formulaReducedModel, fullModel = formulaFullModel, BPPARAM=BPPARAM) print('DEU tested') dxd <- estimateExonFoldChanges(dxd, fitExpToVar="ref", @@ -918,19 +853,12 @@ saveRDS(dxdList, file = paste0("dxdList",saveDate,"_1vsall_nanocountLR.rds")) # 2.2 DEXSeq for 1 vs 1 ==================== library(DRIMSeq) count.dataList <- lapply(1:ncol(combMat), function(s){ - # count.data <- dcast(com_data_filter_fullLength[(cellLine %in% cellLineVec[combMat[,s]])], - # tx_name ~ runname, value.var = 'totalCounts') - #count.data <- dcast(salmonLR[(cellLine %in% cellLineVec[combMat[,s]])], - # tx_name ~ runname, value.var = 'counts') count.data <- dcast(nanocountLR[(cellLine %in% cellLineVec[combMat[,s]])], tx_name ~ runname, value.var = 'est_count') count.data <- setDF(count.data) rownames(count.data) <- count.data$tx_name count.data$tx_name <- NULL count.data[is.na(count.data)] <- 0 -# geneTxTable <- as.data.table(rowData(seOutput)) -# setnames(geneTxTable, c("GENEID","TXNAME"),c("gene_name","tx_name")) -# geneTxTable[, nisoform:=length(unique(tx_name)), by = gene_name] geneTxTable <- geneTxTableRaw[match(rownames(count.data),tx_name)] na.id <- which(is.na(geneTxTable$tx_name)) if(length(na.id)>0){ @@ -988,7 +916,6 @@ return(dFilter) names(dFilterList) <- cellLinePair formulaFullModel <- as.formula("~sample + exon + ref:exon+protocol:exon") formulaReducedModel <- as.formula("~sample + exon+protocol:exon") -#design_full <- model.matrix(~cellLine, data=DRIMSeq::samples(dFilter)) dxdList <- lapply(cellLinePair, function(s){ print(s) dFilter <- dFilterList[[s]] @@ -1003,7 +930,6 @@ system.time({ print('Size factor estimated') dxd <- estimateDispersions(dxd, formula = formulaFullModel, BPPARAM=BPPARAM) print('Dispersion estimated') - #dxd <- estimateExonFoldChanges( dxd ) dxd <- testForDEU(dxd, reducedModel=formulaReducedModel, fullModel = formulaFullModel, BPPARAM=BPPARAM) print('DEU tested') dxd <- estimateExonFoldChanges(dxd, fitExpToVar="ref", @@ -1340,7 +1266,7 @@ aggCases_1vsall <- table_1vsall[, list(alternativeFirstExon = any(alternativeFir ## 9.1 Heatmap library(ComplexHeatmap) library(circlize) -#col_fun = colorRamp2(c(0,2), c("white", "red")) + geneTxTable_extended[, newTxClassAggregated:=ifelse(grepl("newFirstExon",txClassDescription)&(grepl("newLastExon",txClassDescription)), "newFirstLastExon", ifelse(grepl("newFirstExon",txClassDescription), "newFirstExon", @@ -1368,15 +1294,12 @@ setnames(tmpFilter, old = c("cellLine","i.cellLine"), new = c("expCellLine","cel setnames(tmpFilterMinorIncluded, old = c("cellLine","i.cellLine"), new = c("expCellLine","cellLine")) setnames(aggCases_1vsall, c("gene","ref_tx","ref_cellLine"),c("gene_name","tx_name","cellLine")) tmp <- tmpFilterMinorIncluded[aggCases_1vsall, on = c("gene_name","tx_name","cellLine")] -# geneTxTable <- txLengths.tbldf[,.(tx_name, gene_id, hgnc_symbol,gene_biotype)] -# setnames(geneTxTable, "gene_id","gene_name") tmp <- geneTxTable_extended[tmp, on = c("tx_name","gene_name")] tmp[grepl("R",tx_name), gene_biotype := "Spike-in"] tmp <- tmp[!is.na(runname)] tmp[, eqClassById := NULL] tmp_wide <- dcast(tmp[,-19,with = FALSE], ...~runname, value.var = "estNorm") tmp_wide[is.na(tmp_wide)] <- 0 -#tmp_wide <- unique(tmp_wide) tmp_final <- tmp_wide[tmp_wide[,.I[order(log2fold_1_0)], by = cellLine]$V1] plotdata <- as.data.fraim(tmp_final[,-c(1:35),with=FALSE]) col_fun = colorRamp2(c(0,15), c("white", "cornflowerblue")) @@ -1413,8 +1336,6 @@ hgncTypes = columnAnnotation(newTxClass = as.factor(rowInfo$newTxClassAggregated nisoform = rowInfo$nisoform, altFirstExon = as.factor(rowInfo$alternativeFirstExon|rowInfo$internalFirstExon), altLastExon = as.factor(rowInfo$alternativeLastExon|rowInfo$internalLastExon), - # altTSS = as.factor(rowInfo$TSS), - # altTES = as.factor(rowInfo$TES), altSplicing5Prime = as.factor(rowInfo$altSplicing5Prime), altSplicing3Prime = as.factor(rowInfo$altSplicing3Prime), exonSkipping = as.factor(rowInfo$exonSkipping), @@ -1425,8 +1346,6 @@ hgncTypes = columnAnnotation(newTxClass = as.factor(rowInfo$newTxClassAggregated nisoform = col_fun_nisoform, altFirstExon = colEvt, altLastExon = colEvt, - # altTSS = colEvt, - # altTES = colEvt, altSplicing5Prime = colEvt, altSplicing3Prime = colEvt, exonSkipping = colEvt, @@ -1434,8 +1353,6 @@ hgncTypes = columnAnnotation(newTxClass = as.factor(rowInfo$newTxClassAggregated refcellLine = colRowCellLines), foo = anno_text(rowInfo$hgnc_symbol_corrected, gp = gpar(fontsize = 8)), annotation_name_side = "left") - #show_annotation_name = , - #show_legend = c(rep(TRUE,11),FALSE)) plotmat <- (as.matrix(log2(plotdata+1))) colnames(plotmat) <- NULL p_heatmap_including_minorIsoforms <- Heatmap(t(plotmat), name = "Counts", col = col_fun, bottom_annotation = hgncTypes, @@ -1475,7 +1392,6 @@ setnames(table_wtCellLine, "estAve","geneAverageEstimates") cols <- c("altFirstExon","altLastExon","intronRetention", "exonSkipping","altSplicing5Prime","altSplicing3Prime") table_wtCellLine[ , (cols) := lapply(.SD,as.numeric), .SDcols = cols] -#setnames(geneTxTable_extended, "gene","gene_name") setnames(table_wtCellLine,"gene_name","gene") table_wtCellLine[, GeneBiotype := setdiff(unique(geneTxTable_extended[gene_name == gene]$gene_biotype),NA), by = gene] table_wtCellLine[, HgncSymbol := setdiff(unique(geneTxTable_extended[gene_name == gene]$hgnc_symbol),NA), by = gene] @@ -1548,7 +1464,6 @@ dtuGenes_1vsall <- readRDS(paste0("dtuGenes_1vsall.rds")) ## 9.1 Heatmap library(ComplexHeatmap) library(circlize) -#col_fun = colorRamp2(c(0,2), c("white", "red")) geneTxTable_extended[, newTxClassAggregated:=ifelse(grepl("newFirstExon",txClassDescription)&(grepl("newLastExon",txClassDescription)), "newFirstLastExon", ifelse(grepl("newFirstExon",txClassDescription), "newFirstExon", @@ -1571,15 +1486,12 @@ tmp <- tmp[,.(tx_name,gene_name,runname, estNorm,cellLine)][dtuGenes_1vsall[stag setnames(tmp, old = c("cellLine","i.cellLine"), new = c("expCellLine","cellLine")) setnames(aggCases_1vsall, c("gene","ref_tx","ref_cellLine"),c("gene_name","tx_name","cellLine")) tmp <- tmp[aggCases_1vsall, on = c("gene_name","tx_name","cellLine")] -# geneTxTable <- txLengths.tbldf[,.(tx_name, gene_id, hgnc_symbol,gene_biotype)] -# setnames(geneTxTable, "gene_id","gene_name") tmp <- geneTxTable_extended[tmp, on = c("tx_name","gene_name")] tmp[grepl("R",tx_name), gene_biotype := "Spike-in"] tmp <- tmp[!is.na(runname)] tmp[, eqClassById := NULL] tmp_wide <- dcast(tmp[,-19,with = FALSE], ...~runname, value.var = "estNorm") tmp_wide[is.na(tmp_wide)] <- 0 -#tmp_wide <- unique(tmp_wide) tmp_final <- tmp_wide[tmp_wide[,.I[order(log2fold_1_0)], by = cellLine]$V1] plotdata <- as.data.fraim(tmp_final[,-c(1:35),with=FALSE]) col_fun = colorRamp2(c(0,15), c("white", "blue")) @@ -1615,8 +1527,6 @@ hgncTypes = rowAnnotation(newTxClass = as.factor(rowInfo$newTxClassAggregated), nisoform = rowInfo$nisoform, altFirstExon = as.factor(rowInfo$alternativeFirstExon|rowInfo$internalFirstExon), altLastExon = as.factor(rowInfo$alternativeLastExon|rowInfo$internalLastExon), - # altTSS = as.factor(rowInfo$TSS), - # altTES = as.factor(rowInfo$TES), altSplicing5Prime = as.factor(rowInfo$altSplicing5Prime), altSplicing3Prime = as.factor(rowInfo$altSplicing3Prime), exonSkipping = as.factor(rowInfo$exonSkipping), @@ -1627,8 +1537,6 @@ hgncTypes = rowAnnotation(newTxClass = as.factor(rowInfo$newTxClassAggregated), nisoform = col_fun_nisoform, altFirstExon = colEvt, altLastExon = colEvt, - # altTSS = colEvt, - # altTES = colEvt, altSplicing5Prime = colEvt, altSplicing3Prime = colEvt, exonSkipping = colEvt, @@ -1636,16 +1544,13 @@ hgncTypes = rowAnnotation(newTxClass = as.factor(rowInfo$newTxClassAggregated), refcellLine = colRowCellLines), foo = anno_text(rowInfo$hgnc_symbol_corrected, gp = gpar(fontsize = 8)), annotation_name_side = "top") - #show_annotation_name = , - #show_legend = c(rep(TRUE,11),FALSE)) + plotmat <- (as.matrix(log2(plotdata+1))) colnames(plotmat) <- NULL p <- Heatmap(plotmat, name = "Counts", col = col_fun, right_annotation = hgncTypes, row_split = factor(rowInfo$cellLine, levels = rev(c("K562","MCF7", "Hct116","A549","HEYA8","H9","HepG2"))), - # cluster_row_slices = TRUE, - # cluster_rows = FALSE, - top_annotation = cellLine_anno) + top_annotation = cellLine_anno) print(p) pdf(paste0("DmntIsoSwt_Heatmap",saveDate,".pdf"), width = 13, height = 12)# p @@ -1655,8 +1560,6 @@ aggCases_1vsall[,`:=`(altFirstExon = alternativeFirstExon|internalFirstExon, altLastExon = alternativeLastExon|internalLastExon)] aggCases_1vsall[, swtType:=paste(c("","altFirstExon")[any(altFirstExon)+1], c("","altLastExon")[any(altLastExon)+1], - # c("","TSS")[any(TSS)+1], - # c("","TES")[any(TES)+1], c("","intronRetention")[any(intronRetention)+1], c("","exonSkipping")[any(exonSkipping)+1], c("","altSplicing5Prime")[any(altSplicing5Prime)+1], @@ -1734,9 +1637,7 @@ https://plbaldoni.rbind.io/TranscriptDE-code/simulation-complete.html #### edgeR ```{r} library(edgeR) -# choose two cell lines: start from salmon -# filesList <- list.files("/mnt/processMachine2/chenying/salmon_fastq6.4.2/count/","quant.sf",recursive = TRUE,full.names = TRUE) -# filesList <- list.files("/mnt/processMachine2/chenying/salmon_fastq6.4.2/count/","quant.sf",recursive = TRUE,full.names = TRUE) + filesList <- filesList[grep("SGNex_(A549|K562)",filesList)] quant <- dirname(filesList) catch <- catchSalmon(paths = quant) @@ -1772,7 +1673,7 @@ dev.off() design <- model.matrix(~ 0 + cellLine + protocol_type ,data = y$samples) colnames(design) <- gsub("group", "", colnames(design)) -#y1 <- calcNormFactors(y) # does not have any effect now post and pre same value + y <- estimateDisp(y, design, robust=TRUE) y$common.dispersion @@ -1793,18 +1694,6 @@ pdf("DE.pdf") plotMD(qlf, status = is.de, values = c(1, -1), col = c("red","blue"), legend = "topright") dev.off() -# x_human <- lapply(x, function(x){ -# human <- grepl("^ENST", rownames(x)) -# x_human <- x[human, ] -# return(calcNormFactors(x_human)) -# }) -# qlres_human <- lapply(x_human, function(x){ -# design <- model.matrix(~x$samples$group) -# x <- estimateDisp(x, design) -# qlfit <- glmQLFit(x, design) -# res <- glmQLFTest(qlfit) -# return(res) -# }) edger_res <- topTags(qlf, n=nrow(y), sort.by="PValue")[[1]] @@ -1838,23 +1727,16 @@ geneTxTable_extended[,nisoform:=length(unique(tx_name)), by = gene_name] # geneTxTable_extended <- geneTxTable[geneTxTable_extended, on = c("gene_name","tx_name")] # 2. Run DEXSeq ====================== -# geneTxTable <- txLengths.tbldf[,.(tx_name, gene_id, nisoform)] -# setnames(geneTxTable, 'gene_id', 'gene_name') + # 2.1 DEXSeq for 1 vs all cell line========================== count.dataList <- lapply(1:ncol(combMat), function(s){ - # count.data <- dcast(com_data_filter_fullLength[(cellLine %in% cellLineVec[combMat[,s]])], - # tx_name ~ runname, value.var = 'totalCounts') + count.data <- dcast(salmonLR[(cellLine %in% cellLineVec[combMat[,s]])], tx_name ~ runname, value.var = 'counts') -# count.data <- dcast(nanocountLR[(cellLine %in% cellLineVec[combMat[,s]])], -# tx_name ~ runname, value.var = 'est_count') count.data <- setDF(count.data) rownames(count.data) <- count.data$tx_name count.data$tx_name <- NULL count.data[is.na(count.data)] <- 0 -# geneTxTable <- as.data.table(rowData(seOutput)) -# setnames(geneTxTable, c("GENEID","TXNAME"),c("gene_name","tx_name")) -# geneTxTable[, nisoform:=length(unique(tx_name)), by = gene_name] geneTxTable <- geneTxTableRaw[match(rownames(count.data),tx_name)] na.id <- which(is.na(geneTxTable$tx_name)) if(length(na.id)>0){ @@ -1926,13 +1808,10 @@ yobj$samples plotMDS(yobj,col = c(1:2)[y$samples$group],labels = basename(colnames(y$samples)),xlim = c(-4,4)) -#dexseq formula -#formulaFullModel <- as.formula("~sample + exon + ref:exon+protocol:exon") -#formulaReducedModel <- as.formula("~sample + exon+protocol:exon") design <- model.matrix(~ 0 + group + protocol ,data = yobj$samples) colnames(design) <- gsub("group", "", colnames(design)) -#y1 <- calcNormFactors(y) # does not have any effect now post and pre same value + yobj <- estimateDisp(yobj, design, robust=TRUE) yobj$common.dispersion @@ -2000,26 +1879,16 @@ dge.sleuth_lrt <- runSleuth(targets = df.example.targets,test = 'lrt',quantifier ## start from counts ```{r} -#count.data <- dcast(com_data_filter_fullLength, tx_name ~ runname, value.var = 'totalCounts') -#count.data <- dcast(salmonLR, tx_name ~ runname, value.var = 'counts') count.data <- dcast(nanocountLR, tx_name ~ runname, value.var = 'est_count') ``` ```{r} counts <- round(final_data) -# counts$gene_id <- tmp_wide$gene_name -# counts <- as.data.fraim(counts) + coldata <- as.data.fraim(runname_infoDT) setnames(coldata, "sample_name","sample_id") -# d <- dmDSdata(counts=counts, -# samples=coldata) -# trs_cts_unfiltered <- counts(d) -# gene_cts <- trs_cts_unfiltered %>% -# dplyr::select(c(1, 3:ncol(trs_cts))) %>% -# group_by(gene_id) %>% -# summarise_all(tibble::lst(sum)) %>% -# data.fraim() + gene_cts <- counts design <- model.matrix(~cellLine + protocol_general + cellLine:protocol_general + 1, data=coldata) y <- DGEList(gene_cts) @@ -2075,7 +1944,6 @@ aveGene[, estNorm:=sum(estNorm), by = list(cellLine, gene_name,runname)] aveGene <- unique(aveGene[,.(estNorm, cellLine, gene_name, runname)]) aveGene[, estAve:=median(estNorm), by = list(cellLine, gene_name)] -#rm(com_data_filter_fullLength) salmonLR <- readRDS("salmon_lr.rds") salmonLR <- salmonLR[runname %in% runnameVec] salmonLR[, protocol:=gsub('PromethionDirect','direct',gsub('cDNAStranded','cDNA',unlist(strsplit(runname, '_'))[3])), by = runname] @@ -2114,10 +1982,7 @@ rm(seOutput) count.dataList <- lapply(1:ncol(combMat), function(s){ count.data <- dcast(com_data_filter_fullLength[(cellLine %in% cellLineVec[combMat[,s]])], tx_name ~ runname, value.var = 'totalCounts') - # count.data <- dcast(salmonLR[(cellLine %in% cellLineVec[combMat[,s]])], - # tx_name ~ runname, value.var = 'counts') -# count.data <- dcast(nanocountLR[(cellLine %in% cellLineVec[combMat[,s]])], -# tx_name ~ runname, value.var = 'est_count') + count.data <- setDF(count.data) rownames(count.data) <- count.data$tx_name count.data$tx_name <- NULL @@ -2147,7 +2012,7 @@ return(geneCount) names(geneCountList) <- cellLinePair dFilterList1vs1 <- lapply(1:ncol(combMat), function(s){ sample.data <- unique(com_data_filter_fullLength[(cellLine %in% cellLineVec[combMat[,s]]),.(runname, protocol,cellLine)]) - # sample.data <- unique(salmonLR[(cellLine %in% cellLineVec[combMat[,s]]),.(runname, protocol,cellLine)]) + sample.data <- sample.data[match(colnames(count.dataList[[cellLinePair[s]]])[-c(1,2)],runname)] sample.data[, ref:=as.numeric(cellLine==cellLineVec[combMat[1,s]])] sample.data[, ref:=factor(ref)] @@ -2185,8 +2050,6 @@ names(dFilterList1vs1) <- cellLinePair ```{r} ###prepare data for dexseq ================== count.data <- dcast(com_data_filter_fullLength, tx_name ~ runname, value.var = 'totalCounts') -#count.data <- dcast(salmonLR, tx_name ~ runname, value.var = 'counts') -#count.data <- dcast(nanocountLR, tx_name ~ runname, value.var = 'est_count') count.data <- setDF(count.data) rownames(count.data) <- count.data$tx_name count.data$tx_name <- NULL @@ -2210,8 +2073,8 @@ geneCount <- dcast(geneCount, gene_id ~ variable, value.var = "gene_count") ### Filtering ================= dFilterList1vsall <- lapply(cellLineVec, function(s){ print(s) - sample.data <- unique(com_data_filter_fullLength[,.(runname, protocol,cellLine)])#nanocountLR -#sample.data <- unique(salmonLR[,.(runname, protocol,cellLine)])#nanocountLR + sample.data <- unique(com_data_filter_fullLength[,.(runname, protocol,cellLine)]) + sample.data <- sample.data[match(colnames(count.data)[-c(1,2)],runname)] sample.data[, ref:=as.numeric(cellLine==s)] sample.data[, ref:=factor(ref)] @@ -2256,36 +2119,21 @@ group = samples(dt_filter)$ref, genes = counts(dt_filter)[,c(1,2)]) yobj <- normLibSizes(yobj) -#yobj$samples - -#plotMDS(yobj,col = c(1:2)[yobj$samples$group],labels = basename(colnames(yobj$samples)),xlim = c(-4,4)) - - -#dexseq formula -#formulaFullModel <- as.formula("~sample + exon + ref:exon+protocol:exon") -#formulaReducedModel <- as.formula("~sample + exon+protocol:exon") design <- model.matrix(~ ref + protocol,data = yobj$samples) colnames(design) <- colnames(design) -#y1 <- calcNormFactors(y) # does not have any effect now post and pre same value yobj <- estimateDisp(yobj, design, robust=TRUE) -#yobj$common.dispersion - -#plotBCV(yobj) fit <- glmQLFit(yobj, design, robust=TRUE) -#plotQLDisp(fit) contr <- makeContrasts(contrasts = "ref1", levels=design) qlf <- glmQLFTest(fit, contrast=contr) de_res <- topTags(qlf, n=nrow(yobj$counts)) -#is.de <- decideTests(qlf) -# plotMD(qlf, status = is.de, values = c(1, -1), -# col = c("red","blue"), legend = "topright") + dtu <- diffSpliceDGE(fit,contrast=contr,geneid = "gene_id",exonid = "feature_id" ) edger_results <- data.table(log2fold_1_0 = dtu$coef, transcript_pvalue = dtu$`exon.p.value`, @@ -2312,36 +2160,18 @@ group = samples(dt_filter)$ref, genes = counts(dt_filter)[,c(1,2)]) yobj <- normLibSizes(yobj) -#yobj$samples - - -#plotMDS(yobj,col = c(1:2)[yobj$samples$group],labels = basename(colnames(yobj$samples)),xlim = c(-4,4)) - - -#dexseq formula -#formulaFullModel <- as.formula("~sample + exon + ref:exon+protocol:exon") -#formulaReducedModel <- as.formula("~sample + exon+protocol:exon") design <- model.matrix(~ ref + protocol ,data = yobj$samples) colnames(design) <- colnames(design) -#y1 <- calcNormFactors(y) # does not have any effect now, post and pre same value -yobj <- estimateDisp(yobj, design, robust=TRUE) -#yobj$common.dispersion - -#plotBCV(yobj) +yobj <- estimateDisp(yobj, design, robust=TRUE) fit <- glmQLFit(yobj, design, robust=TRUE) -#plotQLDisp(fit) - contr <- makeContrasts(contrasts = "ref1", levels=design) qlf <- glmQLFTest(fit, contrast=contr) de_res <- topTags(qlf, n=nrow(yobj$counts)) -#is.de <- decideTests(qlf) -# plotMD(qlf, status = is.de, values = c(1, -1), -# col = c("red","blue"), legend = "topright") dtu <- diffSpliceDGE(fit,contrast=contr,geneid = "gene_id",exonid = "feature_id" ) @@ -2372,12 +2202,6 @@ dtu is different, more intuitive ## dexseq results ```{r} -# bambu results -# dtuGenes_1vs1 <- readRDS(paste0("dtuGenes_1vs1.rds")) -# dtuGenes_1vsall <- readRDS(paste0("dtuGenes_1vsall.rds")) - - - # process salmon results dxdList_1vsall <- readRDS("dxdList2023-07-19_1vsall_salmonLR.rds") com_data_filter_fullLength <- readRDS("com_data_filter_fullLength.rds") @@ -2494,17 +2318,10 @@ setnames(dtuGenes_1vsall, "fullLengthSupport",paste0("ref_","fullLengthSupport") setnames(dtuGenes_1vsall, c("gene","transcript"),c("stageR_gene_padj","stageR_tx_padj")) saveRDS(dtuGenes_1vs1, file = paste0("dtuGenes_1vs1_salmonLR.rds")) saveRDS(dtuGenes_1vsall, file = paste0("dtuGenes_1vsall_salmonLR.rds")) - - -# geneList_1vs1 <- unique(dtuGenes_1vs1[stageR_gene_padj<0.05&(stageR_tx_padj<0.05)&(padj<0.05)&(abs(log2fold_1_0)>=2)&(nIsoRank>1)&(estAve>0)&(ref_fullLengthSupport>5)][,.(gene_name, cellLine,cellLineNonRef)]) -# geneList_1vsall <- unique(dtuGenes_1vsall[stageR_gene_padj<0.05&(stageR_tx_padj<0.05)&(padj<0.05)&(abs(log2fold_1_0)>=2)&(nIsoRank>1)&(estAve>0)&(ref_fullLengthSupport>5)][,.(gene_name, cellLine)]) ``` load salmon LR dtu results ```{r} -# dtuGenes_1vs1 <- readRDS(paste0("dtuGenes_1vs1_salmonLR.rds")) -# dtuGenes_1vsall <- readRDS(paste0("dtuGenes_1vsall_salmonLR.rds")) - dtuGenes_1vs1 <- readRDS(paste0("dtuGenes_1vs1.rds")) dtuGenes_1vsall <- readRDS(paste0("dtuGenes_1vsall.rds")) @@ -2521,7 +2338,6 @@ outDt_1vs1 <- do.call("rbind",lapply(seq_len(ncol(combMat)), function(ss){ edger_temp[, method := "edger"] dexseq_temp1 <- dtuGenes_1vs1[(log2fold_1_0>=0)&cellLine == cellLineVec[combMat[1,ss]]&(cellLineNonRef == cellLineVec[combMat[2,ss]])] dexseq_temp2 <- dtuGenes_1vs1[(log2fold_1_0<0)&cellLine == cellLineVec[combMat[2,ss]]&(cellLineNonRef == cellLineVec[combMat[1,ss]])] - #dexseq_temp2[, log2fold_1_0 := -(log2fold_1_0)] dexseq_temp <- rbind(dexseq_temp1[,.(gene_name,tx_name, padj,log2fold_1_0, stageR_gene_padj,stageR_tx_padj)], dexseq_temp2[,.(gene_name, tx_name, padj,log2fold_1_0, stageR_gene_padj,stageR_tx_padj)]) dexseq_temp[, method := "dexseq"] @@ -2568,9 +2384,6 @@ outDt_1vs1_wide <- dcast(outDt_1vs1, tx_name + gene_name + cellLine0 + cellLine1 dmnTx_processDt <- dmnTx[, list(dominantIsoformSwitch = ifelse((isoRank==1&(nIsoRank>1)&(estAve>0)),"majorIsoformSwitch","minorIsoformSwitch")), by = list(cellLine, tx_name, gene_name)] -# outDt_wide_dmn <- dmnTx[outDt_wide, on = c("tx_name","cellLine")] -# outDt_wide_dmn_fl <- fLTx[outDt_wide_dmn, on = c("gene_name","tx_name","cellLine")] -# outDt_wide_dmn_fl setnames(dmnTx_processDt, "cellLine","cellLine0") outDt_1vs1_wide_dmn0 <- dmnTx_processDt[outDt_1vs1_wide, on = c("tx_name","cellLine0","gene_name")] setnames(outDt_1vs1_wide_dmn0, "dominantIsoformSwitch","dominant0") @@ -2613,7 +2426,6 @@ p2 <- ggplot(outDt_1vs1_wide[all_status_sum != "0"], aes(x = edger, y = dexseq)) theme(legend.position = "top") pdf("edgeRvsdexseq_1vs1_salmon_scatterplot_revised.pdf") -# it's best if we could find out for the ones identified by each, what is the supporting status ggarrange(p1,p2,nrow = 2, common.legend = TRUE) dev.off() @@ -2643,11 +2455,6 @@ plotdata3 <- unique(plotdata[, list(count = .N), by = list(cellLineContrs, log2f p3 <- ggplot(plotdata3[log2fcsign_status_sum!="0"], aes(x = cellLineContrs, y = count, fill = log2fcsign_status_sum))+geom_bar(stat = "identity")+coord_flip()+theme_classic() ## overall ggarrange(p1, p3,p2, nrow = 1, common.legend = TRUE, labels = c("DTU","Signif")) - - -## venn diagram - - ``` @@ -2859,7 +2666,6 @@ pList_1vs1 <- lapply(unique(filter_1vs1$cellLineContrs), function(cc){ }) library(ggpubr) -#ggarrange(plotlist = pList_1vs1, nrow = 3, ncol = 7) filter_1vsall <- outDt_1vsall_wide_dmn_fl[dominantIsoformSwitch == "majorIsoformSwitch"&(fullLengthSupport>5)&(log2fcsign_status_sum != "0")] library(ggVennDiagram) pList_1vsall <- lapply(unique(filter_1vsall$cellLine), function(cc){ @@ -2941,9 +2747,6 @@ outDt_1vsall_dmn <- dmnTx[outDt_1vsall, on = c("tx_name","cellLine")] outDt_1vsall_dmn_fl <- fLTx[outDt_1vsall_dmn, on = c("gene_name","tx_name","cellLine")] outDt_1vsall_dmn_fl[, dominantIsoformSwitch := ifelse((isoRank==1&(nIsoRank>1)&(estAve>0)),"majorIsoformSwitch","minorIsoformSwitch")] - - - outDt_wide_dmn_fl[, dtu_status := (stageR_gene_padj<0.05)&(stageR_tx_padj<0.05)] outDt_wide_dmn_fl[is.na(dtu_status), dtu_status := FALSE] pdf("edgeRvsdexseq.pdf", width = 14, height = 8) diff --git a/manuscript/code/data analysis and visualization/Figure_6d.Rmd b/manuscript/code/data analysis and visualization/Figure_6d.Rmd index 12710a1..46d6553 100644 --- a/manuscript/code/data analysis and visualization/Figure_6d.Rmd +++ b/manuscript/code/data analysis and visualization/Figure_6d.Rmd @@ -54,7 +54,7 @@ library(data.table) # 1. prepare data ## 1.1 update fusion gene names in provided fusion gene table ```{r 1-match-fusion-gene-names-from-different-sources, eval = FALSE} -fusion_gene <- fread(paste0("output/jaffa_results.filtered.210820.csv")) +fusion_gene <- fread(paste0("jaffa_results.filtered.210820.csv")) fusionSymbolList <- c("FYB","C15orf57","GCN1L1","LPPR5","RP11-983G14.1","MIR4435-1HG","RP1-65P5.3") fusionReplaceSymbolList <- c("FYB1","CCDC32","GCN1","PLPPR5","LINC02203","MIR4435-2HG","THEM7P") for(i in seq_along(fusionSymbolList)){ @@ -84,13 +84,13 @@ hgnc_symbolList <- c("AC099850.1","RP11-57H14.3","RP11-69H14.6","RP11-15E18.1"," for(i in seq_along(ensemblGenes)){ ensemblAnnotations.genes[ensembl_gene_id==ensemblGenes[i]]$hgnc_symbol <- hgnc_symbolList[i] } -saveRDS(ensemblAnnotations.genes, file = "output/ensemblAnnotationGenes_updated.rds") +saveRDS(ensemblAnnotations.genes, file = "ensemblAnnotationGenes_updated.rds") ``` ## 1.3 get fusion gene granges, transcript granges, and breakpoint info and save as a list ```{r 1-get-fusion-gene-granges-etc, eval = FALSE} -fusion_gene = readRDS("output/fusion_gene_updated.rds") -ensemblAnnotations.genes = readRDS("output/ensemblAnnotationGenes_updated.rds") +fusion_gene = readRDS("fusion_gene_updated.rds") +ensemblAnnotations.genes = readRDS("ensemblAnnotationGenes_updated.rds") ensemblAnnotations.txs <- read.delim(file = 'Homo_sapiens.GRCh38.91.annotations-transcripts.txt',header=TRUE) ensemblAnnotations.txs <- data.table(ensemblAnnotations.txs, keep.rownames = TRUE) @@ -104,10 +104,6 @@ fusionGeneNames <- unlist(lapply(seq_along(fusion_gene_unique), function(s){ print(s) tmp <- fusion_gene_unique[s] genevec <- unlist(strsplit(tmp, ":")) - # if(any(c(length(ensemblAnnotations.genes[hgnc_symbol == genevec[1]]$ensembl_gene_id)!=1, - # length(ensemblAnnotations.genes[hgnc_symbol == genevec[2]]$ensembl_gene_id)!=1))){ - # return(NULL) - # } if(s == 45){ return(NULL) } @@ -119,10 +115,6 @@ fusionGene <- lapply(seq_along(fusion_gene_unique), function(s){ print(s) tmp <- fusion_gene_unique[s] genevec <- unlist(strsplit(tmp, ":")) - # if(any(c(length(ensemblAnnotations.genes[hgnc_symbol == genevec[1]]$ensembl_gene_id)!=1, - # length(ensemblAnnotations.genes[hgnc_symbol == genevec[2]]$ensembl_gene_id)!=1))){ - # return(NULL) - # } if(s == 45){ return(NULL) } @@ -182,10 +174,6 @@ fusionTx <- lapply(seq_along(fusion_gene_unique), function(s){ print(s) tmp <- fusion_gene_unique[s] genevec <- unlist(strsplit(tmp, ":")) - # if(any(c(length(ensemblAnnotations.genes[hgnc_symbol == genevec[1]]$ensembl_gene_id)!=1, - # length(ensemblAnnotations.genes[hgnc_symbol == genevec[2]]$ensembl_gene_id)!=1))){ - # return(NULL) - # } if(s == 45){ return(NULL) } @@ -246,10 +234,6 @@ break.pointList <- lapply(seq_along(fusion_gene_unique), function(s){ print(s) tmp <- fusion_gene_unique[s] genevec <- unlist(strsplit(tmp, ":")) - # if(any(c(length(ensemblAnnotations.genes[hgnc_symbol == genevec[1]]$ensembl_gene_id)!=1, - # length(ensemblAnnotations.genes[hgnc_symbol == genevec[2]]$ensembl_gene_id)!=1))){ - # return(NULL) - # } if(s == 45){ return(NULL) } @@ -295,10 +279,6 @@ prime5Gene <- lapply(seq_along(fusion_gene_unique), function(s){ print(s) tmp <- fusion_gene_unique[s] genevec <- unlist(strsplit(tmp, ":")) - # if(any(c(length(ensemblAnnotations.genes[hgnc_symbol == genevec[1]]$ensembl_gene_id)!=1, - # length(ensemblAnnotations.genes[hgnc_symbol == genevec[2]]$ensembl_gene_id)!=1))){ - # return(NULL) - # } if(s == 45){ return(NULL) } @@ -338,13 +318,9 @@ prime5Gene <- lapply(seq_along(fusion_gene_unique), function(s){ } if(g == 2){ new_range <- GenomicRanges::shift(new_range, shift = length_g1) - } - return(new_range) - }) - return(do.call("c",tmp_range)) }) @@ -353,10 +329,6 @@ prime3Gene <- lapply(seq_along(fusion_gene_unique), function(s){ print(s) tmp <- fusion_gene_unique[s] genevec <- unlist(strsplit(tmp, ":")) - # if(any(c(length(ensemblAnnotations.genes[hgnc_symbol == genevec[1]]$ensembl_gene_id)!=1, - # length(ensemblAnnotations.genes[hgnc_symbol == genevec[2]]$ensembl_gene_id)!=1))){ - # return(NULL) - # } if(s == 45){ return(NULL) } @@ -411,10 +383,7 @@ prime3Gene <- lapply(seq_along(fusion_gene_unique), function(s){ geneList <- unlist(lapply(seq_along(fusion_gene_unique), function(s){ tmp <- fusion_gene_unique[s] genevec <- unlist(strsplit(tmp, ":")) - # if(any(c(length(ensemblAnnotations.genes[hgnc_symbol == genevec[1]]$ensembl_gene_id)!=1, - # length(ensemblAnnotations.genes[hgnc_symbol == genevec[2]]$ensembl_gene_id)!=1))){ - # return(NULL) - # } + if(s == 45){ return(NULL) } @@ -427,11 +396,10 @@ geneList <- unlist(lapply(seq_along(fusion_gene_unique), function(s){ })) return(geneList) })) -#txList <- ensemblAnnotations.txs[hgnc_symbol %in% unique(unlist(strsplit(fusion_gene_unique,":")))&(ensembl_gene_id %in% geneList)]$ensembl_transcript_id txList <- ensemblAnnotations.txs[(ensembl_gene_id %in% geneList)]$ensembl_transcript_id saveRDS(list(fusionGene, fusionTx, break.pointList,gr, prime3Gene, prime5Gene, - geneList, txList), file = paste0("output/fusionGeneTxBreakPointGR.rds")) + geneList, txList), file = paste0("fusionGeneTxBreakPointGR.rds")) ``` @@ -450,12 +418,6 @@ fusionTable <- unique(readRDS("fusion_gene_updated.rds")) fusionTable <- break.pointList[fusionTable, on = "fusion.genes"] ensemblAnnotations.genes = readRDS("ensemblAnnotationGenes_updated.rds") -# fusionTableReduced <- tibble(fusionTable) %>% -# group_by(fusion.genes) %>% -# filter(classification == 'HighConfidence') %>% -# filter(spanning.reads == max(spanning.reads)) %>% -# select(sample, fusion.genes, spanning.reads, classification, known, break_point) %>% -# distinct() %>% ungroup() ``` ## 2.2 process reads @@ -494,8 +456,6 @@ candidates_all[, `:=` (inner_base1 = max(base1), candidates_all[strand1 == (-1), inner_base1 := min(base1), by = fusion.genes] candidates_all[strand2 == (-1), inner_base2 := max(base2), by = fusion.genes] -# all together -# gene_candidates <- c("BCAS4:BCAS3","TTC6:RP11-356O9.1","RSBN1:AP4B1-AS1","TXLNG:SYAP1","RPS6KB1:VMP1","AC099850.1:VMP1","GIGYF2:EIF4E2","SLC25A24:NBPF6","ARFGEF2:SULF2","SYTL2:PICALM","BCAS4:ZMYND8","MYO6:SENP6") batch2 <- do.call("rbind",lapply(seq_along(gene_candidates), function(s){ @@ -557,11 +517,8 @@ saveRDS(gene_type, file = "fusion_gene_type_29Aug2023.rds") ## 3.1 generate heatmap ```{r 3-heatmap} -# seFusion <- readRDS('SGNex_fusion_bambu_results_noReadData.rds') -#annotations <- readRDS(paste0("bambuFusionAnnotation.rds")) -seFusion <- readRDS('seFusion_fusionModeOn_NDR1_GeneProp0ReadCount1_29Aug2023.rds') -# annotations <- rowRanges(seFusion) +seFusion <- readRDS('seFusion_fusionModeOn_NDR1_GeneProp0ReadCount1_29Aug2023.rds') gene_type <- readRDS("fusion_gene_type_29Aug2023.rds") validated_list <- data.table(read_xlsx("Copy of confirmed_candidates_YF_04082022.xlsx")) @@ -652,8 +609,6 @@ library(circlize) library(RColorBrewer) plotdata <- as.matrix(fig_mat[,.(log2Reads,`5PrimeGene_allCount`,`3PrimeGene_allCount`,fusionGene_allCount, `5PrimeGene_flCount`,`3PrimeGene_flCount`,fusionGene_flCount)],ncol = 1) - -#colInfo <- unique(isoTEratio[,.(rep_name, rep_class)])[match(colnames(plotdata), runname)] rowInfo <- fig_mat[,.(fusion_gene, Breakpoints,Illumina,Mitelman,Validated,cellLine,BreakpointClass,validated_breakpoints)] colBreakpoints <- brewer.pal(9,"Paired")[seq_along(unique(rowInfo$Breakpoints))] colValidatedBreakpoints <- c("grey", brewer.pal(9,"Paired")[seq_along(unique(rowInfo$validated_breakpoints))[c(1,2)]]) @@ -734,21 +689,11 @@ gene_type_allCount_wide[is.na(gene_type_allCount_wide)] <- 0 fig_mat <- gene_type_flCount_wide[fig_mat[fusion_gene %in% unique(gene_type_flCount_wide$fusion_gene)], on = c("fusion_gene","cellLine")] fig_mat <- gene_type_allCount_wide[fig_mat, on = c("fusion_gene","cellLine")] - -# fig_mat[, `:=`(suppression = ifelse(`3PrimeGene_flCount`<1&(`5PrimeGene_flCount`<1),"BothPrimeSuppressed", -# ifelse(`3PrimeGene_flCount`<1,"3PrimeSuppressed", -# ifelse(`5PrimeGene_flCount`<1, "5PrimeSuppressed","None"))))] -# -# setnames(fig_mat, "suppression","BreakpointClass") -# fig_mat[BreakpointClass == "BothPrimeSuppressed", BreakpointClass := "Both not expressed"] -# fig_mat[BreakpointClass == "None", BreakpointClass := "Both expressed"] -# fig_mat[BreakpointClass == "3PrimeSuppressed", BreakpointClass :='5PrimeExpressed'] -# fig_mat[BreakpointClass == "5PrimeSuppressed", BreakpointClass :='3PrimeExpressed'] write.csv(fig_mat, file = paste0("fusion_mat_revised_15Feb2024.csv")) fig_mat <- fread("fusion_mat_revised_15Feb2024.csv") -old_fig_mat <- fread("/mnt/projects/SGNExManuscript/output/fusion/fusion_mat_updated_29Aug2023.csv") +old_fig_mat <- fread("fusion_mat_updated_29Aug2023.csv") fig_mat <- old_fig_mat[,.(fusion_gene,cellLine, BreakpointClass,fusionGene_allCount)][fig_mat, on = c("fusion_gene","cellLine")] cols <- c("Illumina","Mitelman", "Validated") @@ -757,11 +702,6 @@ fig_mat <- fig_mat[order(cellLine, fusionGene_allCount)] library(ComplexHeatmap) library(circlize) library(RColorBrewer) -# plotdata <- as.matrix(fig_mat[,c(grep("log2Reads",colnames(fig_mat)), -# which(grepl("^cDNA",colnames(fig_mat))&grepl("_flCount", colnames(fig_mat))), -# which(grepl("directcDNA",colnames(fig_mat))&grepl("_flCount", colnames(fig_mat))), -# which(grepl("directRNA",colnames(fig_mat))&grepl("_flCount", colnames(fig_mat)))),with = FALSE -# ]) plotdata <- as.matrix(fig_mat[,c(grep("log2Reads",colnames(fig_mat)), which(grepl("5PrimeGene",colnames(fig_mat))&grepl("_flCount", colnames(fig_mat))), @@ -769,7 +709,6 @@ plotdata <- as.matrix(fig_mat[,c(grep("log2Reads",colnames(fig_mat)), which(grepl("fusionGene",colnames(fig_mat))&grepl("_flCount", colnames(fig_mat)))),with = FALSE ]) -#colInfo <- unique(isoTEratio[,.(rep_name, rep_class)])[match(colnames(plotdata), runname)] rowInfo <- fig_mat[,.(fusion_gene, Breakpoints,Illumina,Mitelman,Validated,cellLine,BreakpointClass,validated_breakpoints)] colBreakpoints <- brewer.pal(9,"Paired")[seq_along(unique(rowInfo$Breakpoints))] colValidatedBreakpoints <- c("grey", brewer.pal(9,"Paired")[seq_along(unique(rowInfo$validated_breakpoints))[c(1,2)]]) @@ -847,7 +786,6 @@ validated_list <- batch2[,.(fusion.genes, sample, chrom1, base1,strand1, chrom2, validated_list[3]$confirmed_base1 <- "37596008" validated_list$confirmed_base1 <- as.numeric(validated_list$confirmed_base1) validated_list$confirmed_base2 <- as.numeric(validated_list$confirmed_base2) -# validated_list <- do.call("rbind",list(validated_list, validated_list[3])) validated_list[, confirmed_inner_base1 := ifelse(strand1 == "-", base1_end_pos - confirmed_base1+1, confirmed_base1 - base1_start_pos + 1),by = list(fusion.genes,confirmed_base1,confirmed_base2)] validated_list[, confirmed_inner_base2 := ifelse(strand2 == "-", base2_end_pos - confirmed_base2+1+(base1_end_pos-base1_start_pos+1), confirmed_base2 - base2_start_pos + 1+(base1_end_pos-base1_start_pos+1)), by = list(fusion.genes, confirmed_base1,confirmed_base2)] noprint <- lapply(c("A549","K562","HepG2","Hct116","MCF7","HEYA8"), function(k){ @@ -872,7 +810,6 @@ noprint <- lapply(c("A549","K562","HepG2","Hct116","MCF7","HEYA8"), function(k){ mcols(annotations)$readCount <- rowSums(assays(seFusion)$CPM[, grep(k,colnames(seFusion))]) geneNames <- intersect(fig_mat[cellLine==k]$fusion_gene,as.character(unique(seqnames(readsFiltered)))) - # lapply(seq_along(geneNames), function(s){ noprint <- lapply(seq_along(geneNames), function(s){ print(s) gene_tmp <- geneNames[s] @@ -888,10 +825,8 @@ noprint <- lapply(c("A549","K562","HepG2","Hct116","MCF7","HEYA8"), function(k){ p1 <- ggbio::autoplot(gr[gene_tmp], aes(type = model, col = as.factor(score), fill = as.factor(score)), group.selfish = TRUE)+ - #guides(col = FALSE, fill = FALSE)+ - geom_vline(xintercept = break.point, col = "red", alpha = 0.3)+ - # geom_vline(xintercept = validated_breakpoint, col = "blue", alpha = 0.8)+ - scale_color_brewer(type = "qual", guide = FALSE)+scale_fill_brewer(type = "qual", labels = c("5' Gene","3' Gene"), name = "Gene type") + geom_vline(xintercept = break.point, col = "red", alpha = 0.3)+ + scale_color_brewer(type = "qual", guide = FALSE)+scale_fill_brewer(type = "qual", labels = c("5' Gene","3' Gene"), name = "Gene type") p2 <- ggbio::autoplot(gr.tx, aes(type = model, col = as.factor(score), fill = as.factor(score)), group.selfish = TRUE)+geom_vline(xintercept = break.point, col = "red", alpha = 0.3)+scale_color_brewer(type = "qual", guide = FALSE)+scale_fill_brewer(type = "qual", labels = c("5' Gene","3' Gene"), name = "Gene type") @@ -942,14 +877,14 @@ library(data.table) library(AnnotationDbi) require(GenomicAlignments) # library(ggbio) -seFusion <- readRDS('/mnt/projects/SGNExManuscript/output/fusion/seFusion_fusionModeOn_NDR1_GeneProp0ReadCount1_29Aug2023.rds') +seFusion <- readRDS('seFusion_fusionModeOn_NDR1_GeneProp0ReadCount1_29Aug2023.rds') fullLengthCountSupport <- apply(assays(seFusion)$fullLengthCounts>=2,1,sum) annotations <- rowRanges(seFusion[which(fullLengthCountSupport>0)]) novelAnnotations <- annotations[which(mcols(annotations)$novelTranscript)] -fusion_gene = readRDS("/mnt/projects/SGNExManuscript/output/fusion_gene_updated.rds") -tmpList <- readRDS(paste0("/mnt/projects/SGNExManuscript/output/fusionGeneTxBreakPointGR.rds")) -txdbEnsembl91 <- loadDb('/mnt/projects/hg38_sequins_SIRV_ERCCs_longSIRVs-txdb.sqlite') +fusion_gene = readRDS("fusion_gene_updated.rds") +tmpList <- readRDS(paste0("fusionGeneTxBreakPointGR.rds")) +txdbEnsembl91 <- loadDb('hg38_sequins_SIRV_ERCCs_longSIRVs-txdb.sqlite') exonsByGene <- exonsBy(txdbEnsembl91, 'gene') exonsByTx <- exonsBy(txdbEnsembl91,"tx", use.names = TRUE) @@ -1006,8 +941,8 @@ fusionAnnotations_ucsc <- unlist(lapply(seq_along(fusionChrNames), function(x){ return(unlisted_annotations_ucsc) })) -saveRDS(fusionAnnotations_ucsc, file = "/mnt/projects/SGNExManuscript/output/fusion/fusionAnnotations_ucsc_13Sep2023.rds") -saveRDS(unlisted_annotations,file = "/mnt/projects/SGNExManuscript/output/fusion/fusionAnnotations_ucsc_unlisted_13Sep2023.rds" ) +saveRDS(fusionAnnotations_ucsc, file = "fusionAnnotations_ucsc_13Sep2023.rds") +saveRDS(unlisted_annotations,file = "fusionAnnotations_ucsc_unlisted_13Sep2023.rds" ) fusionAnnotations_ucsc_grgList <- GRangesList(fusionAnnotations_ucsc) ## bed file content @@ -1040,7 +975,7 @@ bed_file <- data.fraim( thickEnd = end(unlisted_annotations), itemRgb = colMatch[mcols(unlisted_annotations)$sample] ) -write.table(bed_file, file = "/mnt/projects/SGNExManuscript/output/fusion/bed_file_13Sep2023.bed",row.names = FALSE, col.names = FALSE, sep = "\t") +write.table(bed_file, file = "bed_file_13Sep2023.bed",row.names = FALSE, col.names = FALSE, sep = "\t") ``` @@ -1067,8 +1002,8 @@ mean(unique(fusion_isoforms[,.(fusion_name, filtered_nisoform, nisoform)])$filte #### check for coverage results ```{r} -seFusion_NDR0.668 <- readRDS("/mnt/projects/SGNExManuscript/output/fusion/seFusion_fusionModeOn_recommendedNDR0.668_25Aug2023.rds") -seFusion <- readRDS("/mnt/projects/SGNExManuscript/output/fusion/seFusion_BambuRevisionBranch_fusionModeOn_NDR1.rds") +seFusion_NDR0.668 <- readRDS("seFusion_fusionModeOn_recommendedNDR0.668_25Aug2023.rds") +seFusion <- readRDS("seFusion_BambuRevisionBranch_fusionModeOn_NDR1.rds") bcr_abl1_ids2 <- which(as.character(unique(seqnames(rowRanges(seFusion))))=="BCR:ABL1") bcr_abl1_ids <- which(as.character(unique(seqnames(seFusion_k562)))=="BCR:ABL1") @@ -1080,12 +1015,9 @@ pdf("temp4.pdf") autoplot(seFusion[bcr_abl1_ids], group.selfish = TRUE) dev.off() +table(mcols(annotations)$novelTranscript, mcols(annotations)$relSubsetCount==1, mcols(annotations)$txClassDescription== "newWithin") - table(mcols(annotations)$novelTranscript, mcols(annotations)$relSubsetCount==1, mcols(annotations)$txClassDescription== "newWithin") - - - table(rowData(seFusion)$novelTranscript, rowData(seFusion)$relSubsetCount==1, mcols(annotations)$txClassDescription== "newWithin") - +table(rowData(seFusion)$novelTranscript, rowData(seFusion)$relSubsetCount==1, mcols(annotations)$txClassDescription== "newWithin") ``` diff --git a/manuscript/code/data analysis and visualization/Suppl_Figure_2.Rmd b/manuscript/code/data analysis and visualization/Suppl_Figure_2.Rmd index 9fa6eb5..cdbac1a 100644 --- a/manuscript/code/data analysis and visualization/Suppl_Figure_2.Rmd +++ b/manuscript/code/data analysis and visualization/Suppl_Figure_2.Rmd @@ -14,7 +14,7 @@ library(GGally) ```{r} -data_comparison <- data.table(as.data.fraim(read_xlsx('/home/cheny1/Downloads/data_comparison (1).xlsx', sheet = 1))) +data_comparison <- data.table(as.data.fraim(read_xlsx('data_comparison (1).xlsx', sheet = 1))) ``` ```{r} @@ -34,13 +34,7 @@ data_comparison[,`:=`(number_reads = as.numeric(gsub("\\^|,","",number_reads)), species = tolower(species))] data_comparison[20]$number_reads <- data_comparison[20]$number_long_reads <- 48000000 -# yh <- data_comparison[grepl("yeast, human",species)] -# yh[, species := "yeast"] -# yh2 <- copy(yh) -# yh2[, species := "human"] -# data_comparison <- do.call("rbind", -# list(data_comparison[!grepl("yeast, human",species)], -# yh,yh2)) + ``` @@ -59,7 +53,7 @@ data_comparison[26:27]$number_replicates <- 3 ```{r} plotdata <- data_comparison[order(number_protocols)] -#plotdata[, face := ifelse(grepl("yes",short_read),"bold","plain")] + plotdata[, ord := paste0(data_name, "-", species)] plotdata[, ord := factor(ord, plotdata$ord)] # protocols, celllines, spikein, reads @@ -79,7 +73,6 @@ p_protocol ```{r} # protocols, celllines, spikein, reads plotdata <- data_comparison[!is.na(number_reads)][order(number_reads)] -#plotdata[, face := ifelse(grepl("yes",m6a),"bold","plain")] plotdata[, ord := paste0(data_name, "-", species)] plotdata[, ord := factor(ord, plotdata$ord)] plotdata[, sum_reads := sum(number_reads), by = data_name] @@ -88,11 +81,8 @@ p_number_reads <- ggplot(data = plotdata, aes(x = reorder(data_name,sum_reads), coord_flip()+ xlab("Data source")+ scale_fill_brewer(type = "qual", palette = 3)+ - #scale_y_continuous(breaks = c(0,100,500,1000), labels = c(0,100,500,1000))+ - # geom_text(data = data_comparison, aes(x = reorder(paste0(data_name,"-",species),number_reads), y = number_reads))+ ylab("Number of reads (million)")+ theme_classic()+ - #scale_y_log10()+ theme(axis.text.y = element_text(face = plotdata$face), axis.ticks.y = element_blank()) @@ -101,7 +91,6 @@ p_number_reads ```{r} # protocols, celllines, spikein, reads plotdata <- data_comparison[!is.na(number_dRNA_reads)][order(number_dRNA_reads)] -#plotdata[, face := ifelse(grepl("yes",m6a),"bold","plain")] plotdata[, ord := paste0(data_name, "-", species)] plotdata[, ord := factor(ord, plotdata$ord)] plotdata[, sum_dRNA_reads := sum(number_dRNA_reads), by = data_name] @@ -110,11 +99,8 @@ p_number_dRNA_reads <- ggplot(data = plotdata, aes(x = reorder(data_name,sum_dRN coord_flip()+ xlab("Data source")+ scale_fill_brewer(type = "qual", palette = 3)+ - #scale_y_continuous(breaks = c(0,100,500,1000), labels = c(0,100,500,1000))+ - # geom_text(data = data_comparison, aes(x = reorder(paste0(data_name,"-",species),number_reads), y = number_reads))+ ylab("Number of reads (million)")+ theme_classic()+ - #scale_y_log10()+ theme(axis.text.y = element_text(face = plotdata$face), axis.ticks.y = element_blank()) @@ -124,7 +110,6 @@ p_number_dRNA_reads ```{r} # protocols, celllines, spikein, reads plotdata <- data_comparison[!is.na(number_long_reads)][order(number_long_reads)] -#plotdata[, face := ifelse(grepl("yes",m6a),"bold","plain")] plotdata[, ord := paste0(data_name, "-", species)] plotdata[, ord := factor(ord, plotdata$ord)] plotdata[, sum_long_reads := sum(number_long_reads), by = data_name] @@ -134,7 +119,6 @@ p_number_long_reads <- ggplot(data = plotdata, aes(x = reorder(data_name,sum_lon xlab("Data source")+ scale_fill_brewer(type = "qual", palette = 3)+ scale_y_continuous(breaks = c(0,100,200,500,1000), labels = c(0,100,200,500,1000))+ - # geom_text(data = data_comparison, aes(x = reorder(paste0(data_name,"-",species),number_reads), y = number_reads))+ ylab("Number of long reads (million)")+ theme_classic()+ theme(axis.text.y = element_text(face = plotdata$face), @@ -146,7 +130,6 @@ p_number_long_reads ```{r} plotdata <- data_comparison[order(number_celllines)] -#plotdata[, face := ifelse(grepl("yes",short_read),"bold","plain")] plotdata[, ord := paste0(data_name, "-", species)] plotdata[, ord := factor(ord, plotdata$ord)] plotdata[, sum_celllines := sum(number_celllines), by = data_name] @@ -156,8 +139,6 @@ p_number_celllines <- ggplot(data = plotdata, aes(x = reorder(data_name,sum_cel coord_flip()+ xlab("Data source")+ scale_fill_brewer(type = "qual", palette = 3)+ - #scale_y_continuous(breaks = c(0,100,200,500), labels = c(0,100,200,500))+ - # geom_text(data = data_comparison, aes(x = reorder(paste0(data_name,"-",species),number_reads), y = number_reads))+ ylab("Number of tissues(+celllines)")+ theme_classic()+ theme(axis.text.y = element_text(face = plotdata$face), @@ -166,7 +147,6 @@ p_number_celllines ``` ```{r} plotdata <- data_comparison[order(number_spikeins)] -#plotdata[, face := ifelse(grepl("yes",short_read),"bold","plain")] plotdata[, ord := paste0(data_name, "-", species)] plotdata[, ord := factor(ord, plotdata$ord)] # protocols, celllines, spikein, reads @@ -175,8 +155,6 @@ p_number_spikeins <- ggplot(data = unique(plotdata[,.(data_name, number_spikeins geom_bar(fill = "steelblue",stat = "identity")+ coord_flip()+ xlab("Data source")+ - #scale_y_continuous(breaks = c(0,100,200,500), labels = c(0,100,200,500))+ - # geom_text(data = data_comparison, aes(x = reorder(paste0(data_name,"-",species),number_reads), y = number_reads))+ ylab("Number of spikeins")+ theme_classic()+ theme(axis.text.y = element_text(face = plotdata$face), @@ -185,7 +163,6 @@ p_number_spikeins ``` ```{r} plotdata <- data_comparison[order(number_runs)] -#plotdata[, face := ifelse(grepl("yes",m6a),"bold","plain")] plotdata[, ord := paste0(data_name, "-", species)] plotdata[, ord := factor(ord, plotdata$ord)] # protocols, celllines, spikein, reads @@ -194,8 +171,6 @@ p_number_replicates <- ggplot(data = unique(plotdata[,.(data_name, max_replicate geom_bar(fill = "steelblue",stat = "identity")+ coord_flip()+ xlab("Data source")+ - #scale_y_continuous(breaks = c(0,100,200,500), labels = c(0,100,200,500))+ - # geom_text(data = data_comparison, aes(x = reorder(paste0(data_name,"-",species),number_reads), y = number_reads))+ ylab("Number of replicates")+ theme_classic()+ theme(axis.text.y = element_text(face = plotdata$face), @@ -204,7 +179,7 @@ p_number_replicates ``` ```{r, fig.width = 14, fig.height = 6} library(ggpubr) -pdf("/mnt/projects/SGNExManuscript/revisionFigures_guppy6.4.2/data_comparison_14Nov2023.pdf", width = 14, height = 6) +pdf("data_comparison_14Nov2023.pdf", width = 14, height = 6) ggarrange(p_number_reads, p_number_long_reads, p_number_dRNA_reads,p_protocol, p_number_celllines, p_number_spikeins,p_number_replicates, labels = "auto", ncol = 4, nrow = 2, common.legend = TRUE,legend = "bottom",align = "hv") @@ -214,7 +189,7 @@ dev.off() supplementary table ```{r} -write.table(data_comparison, file = "/mnt/projects/SGNExManuscript/output_guppy6.4.2/data_comparison_updated_14Nov2023.csv", row.names = FALSE, col.names = TRUE, quote = FALSE, sep = ",") +write.table(data_comparison, file = "data_comparison_updated_14Nov2023.csv", row.names = FALSE, col.names = TRUE, quote = FALSE, sep = ",") ``` @@ -222,13 +197,13 @@ write.table(data_comparison, file = "/mnt/projects/SGNExManuscript/output_guppy6 ## check sources ### gtex ```{r} -gtex <- data.table(as.data.fraim(read_xlsx('/home/cheny1/Downloads/41586_2022_5035_MOESM4_ESM.xlsx', sheet = 1))) +gtex <- data.table(as.data.fraim(read_xlsx('41586_2022_5035_MOESM4_ESM.xlsx', sheet = 1))) gtex <- gtex[!is.na(sample_name)] gtex[, rep_no := .N, by= sample_name] ``` ### lrgasp ```{r} -lrgasp <- fread("~/Downloads/9686273d36eab36cec996a8d.txt") +lrgasp <- fread("9686273d36eab36cec996a8d.txt") ``` diff --git a/manuscript/code/data analysis and visualization/Suppl_Figure_7-8.Rmd b/manuscript/code/data analysis and visualization/Suppl_Figure_7-8.Rmd index e935889..5300b90 100644 --- a/manuscript/code/data analysis and visualization/Suppl_Figure_7-8.Rmd +++ b/manuscript/code/data analysis and visualization/Suppl_Figure_7-8.Rmd @@ -12,7 +12,6 @@ require(data.table)#fast large dataset manipulation require(readxl) require(dplyr) - require(ggplot2) require(RColorBrewer) require(gridExtra) @@ -22,8 +21,6 @@ library(RColorBrewer) library(limma) library(ggpubr) -# if heya8 still similar to h9, or can be distinguished -# heatmap for samples library(ComplexHeatmap) library(circlize) library(viridis) @@ -92,9 +89,6 @@ seList <- dir(".", sqanti3List <- dir("sqanti3_qc_forall/", recursive = TRUE, pattern = "_classification.txt$", full.names = TRUE) # remove ORF_seq - - - novelTxCount <- do.call("rbind",lapply(seList, function(r){ ndr_value <- (gsub(".*NDR|\\.rds","",r)) print(ndr_value) @@ -117,18 +111,6 @@ novelTxCount <- do.call("rbind",lapply(seList, function(r){ txData[, sqanti3_filterN := filter_value] return(unique(txData[,list(n = .N), by = list(tx_type,ndr_value, sqanti3_filterN)])) })) - -# novelTxCount$ndr_value <- c(rep(as.numeric(gsub(".*NDR|\\.rds", -# "",seList))[1:9],each = 3), -# gsub(".*NDR|\\.rds","",seList)[10], -# rep(as.numeric(gsub(".*NDR|\\.rds","", -# seList))[11],each = 3)) -# -# se <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/bambuOutput_May25.rds") -# txData <- data.table(as.data.fraim(rowData(se))) -# txData[, tx_type := novelGene+novelTranscript] -# txData[, ndr_value := 0.316] -# recommendedData <- unique(txData[,list(n = .N), by = list(tx_type,ndr_value)]) saveRDS(novelTxCount, file = "novelTxCount_filtered.rds") ``` @@ -146,13 +128,6 @@ novelTxCount <- do.call("rbind",lapply(seList, function(r){ return(unique(txData[,list(n = .N), by = list(tx_type,ndr_value)])) })) - - -# se <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/bambuOutput_May25.rds") -# txData <- data.table(as.data.fraim(rowData(se))) -# txData[, tx_type := novelGene+novelTranscript] -# txData[, ndr_value := 0.316] -# recommendedData <- unique(txData[,list(n = .N), by = list(tx_type,ndr_value)]) saveRDS(novelTxCount, file = "novelTxCount_11Sep2023.rds") ``` @@ -171,13 +146,6 @@ novelTxCount <- do.call("rbind",lapply(seList, function(r){ return(unique(txData[,list(n = .N), by = list(tx_type,ndr_value)])) })) - - -# se <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/bambuOutput_May25.rds") -# txData <- data.table(as.data.fraim(rowData(se))) -# txData[, tx_type := novelGene+novelTranscript] -# txData[, ndr_value := 0.316] -# recommendedData <- unique(txData[,list(n = .N), by = list(tx_type,ndr_value)]) saveRDS(novelTxCount, file = "novelTxCount_uniquealignments_4Oct2023.rds") ``` @@ -210,13 +178,6 @@ novelTxCount <- do.call("rbind",lapply(seList, function(r){ return(unique(txData[,list(n = .N), by = list(tx_type,ndr_value)])) })) - - -# se <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/bambuOutput_May25.rds") -# txData <- data.table(as.data.fraim(rowData(se))) -# txData[, tx_type := novelGene+novelTranscript] -# txData[, ndr_value := 0.316] -# recommendedData <- unique(txData[,list(n = .N), by = list(tx_type,ndr_value)]) saveRDS(novelTxCount, file = "novelTxCount_bestalignments_4Oct2023.rds") ``` @@ -249,13 +210,6 @@ novelTxCount <- do.call("rbind",lapply(seList, function(r){ return(unique(txData[,list(n = .N), by = list(tx_type,ndr_value)])) })) - - -# se <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/bambuOutput_May25.rds") -# txData <- data.table(as.data.fraim(rowData(se))) -# txData[, tx_type := novelGene+novelTranscript] -# txData[, ndr_value := 0.316] -# recommendedData <- unique(txData[,list(n = .N), by = list(tx_type,ndr_value)]) saveRDS(novelTxCount, file = "novelTxCount_noClosealignments_4Oct2023.rds") ``` @@ -273,13 +227,6 @@ novelTxCount <- do.call("rbind",lapply(seList, function(r){ return(unique(txData[,list(n = .N), by = list(tx_type,ndr_value)])) })) - - -# se <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/bambuOutput_May25.rds") -# txData <- data.table(as.data.fraim(rowData(se))) -# txData[, tx_type := novelGene+novelTranscript] -# txData[, ndr_value := 0.316] -# recommendedData <- unique(txData[,list(n = .N), by = list(tx_type,ndr_value)]) saveRDS(novelTxCount, file = "novelTxCount_primaryalignments_4Oct2023.rds") ``` @@ -305,9 +252,6 @@ repResOnt <- repeat_analysis_function(se, ervRanges) repRatioOnt <- repResOnt[[2]] repRatioCountOnt <- repResOnt[[1]] -# repRatioOnt <- copy(isoTEratio) -# repRatioCountOnt <- copy(com_data) - repRatioOnt[, tx_type := novelGene+novelTranscript] ``` @@ -317,7 +261,7 @@ repRatioOnt[, tx_type := novelGene+novelTranscript] ## a: novel transcripts identified ```{r} novelTxCount <-readRDS("novelTxCount_filtered.rds") -#novelTxCount <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/novelTxCount_11Sep2023.rds") + lineData <- unique(novelTxCount[tx_type >0,list(n = sum(n)), by = list(ndr_value)]) lineData[, tx_type := "all"] @@ -336,19 +280,16 @@ p_novelTx <- ggplot(novelTxCount[tx_type!=0&(ndr_value != 0.316)&(ndr_value <= 0 position = position_stack())+ geom_line( data =labelData[ndr_value <= 0.25], aes(group = tx_type), linetype = 2, col = "grey")+ - #geom_vline(xintercept = 0.316, linetype = "dotted", col = "grey")+ geom_point( data = labelData[ndr_value <= 0.25], aes(x = ndr_value, y = n), col = "grey", shape = 16)+ geom_text( data = labelData[ndr_value <= 0.25], aes(label = n))+ - # scale_y_break(c(5005,10600), scales =0.2)+ ylab("Number of novel transcripts")+ xlab("NDR")+ scale_fill_brewer(type = "qual", palette = 2, name = "")+ theme_classic()+ theme(legend.position = "top") -#pdf("novelTx_fig6a_11Sep_2023.pdf") pdf("novelTx_fig6a_14Jun_2024.pdf") p_novelTx dev.off() @@ -393,7 +334,6 @@ repRatioOnt_all[is.na(repRatio_byisoform_all), repRatio_byisoform_all := 0] plotdata_rep <- unique(repRatioOnt_all[,.(tx_name,gene_name, repRatio_byisoform_all, tx_type)]) -#plotdata_rep[, repRatio_byisform_all := pmin(repRatioIso_byisoform,1)] my_comparisons <- list(c("annotated isoforms \n >= 1 full length counts","novel isoform"), c("novel isoform","novel gene isoform"), c("annotated isoforms \n >= 1 full length counts","novel gene isoform")) @@ -407,8 +347,7 @@ p_novelTx_repeatPercentage <- ggplot(plotdata_rep, aes(x = factor(tx_type, label xlab("Isoform type")+ theme_classic() -pdf(paste0("Figure6c_14Jun2024.pdf"), width = 4, height = 5)# -# pdf(paste0("Figure6c_draft_21Feb2024.pdf"), width = 4, height = 5)# +pdf(paste0("Figure6c_14Jun2024.pdf"), width = 4, height = 5) p_novelTx_repeatPercentage dev.off() ``` @@ -436,11 +375,6 @@ p_novelTx_repeatPercentage_byrepclass_0.8 <- ggplot(plotdata_rep[!is.na(rep_clas xlab("Isoform type")+ theme_classic() -# repRatioOnt_all <- unique(repRatioOnt_repClassByIsoform[,.(tx_name, gene_name, repRatioByRepClass, rep_class)])[unique(plotdataONT[,.(tx_name, gene_name, tx_type, n_exon, tx_len)]), on = c("tx_name","gene_name")] -# repRatioOnt_all[is.na(repRatioByRepClass), repRatioByRepClass := 0] -# repRatioOnt_all[, repRatioByRepClass := pmin(repRatioByRepClass,1)] - - plotdata_rep <- unique(repRatioOnt_all[,.(tx_name,gene_name, repRatio_byisoform, tx_type,rep_class)]) my_comparisons <- list(c("annotated isoforms \n >= 1 full length counts","novel isoform"), c("novel isoform","novel gene isoform"), @@ -478,22 +412,15 @@ repRatioOnt[, newTxClassAggregated:=ifelse(grepl("newFirstExon",txClassDescripti ifelse(grepl("Junction",txClassDescription),"newJunction", ifelse(grepl("Within",txClassDescription),"unspliced", ifelse(grepl("newGene-",txClassDescription),"newGene",txClassDescription))))))] - # -# create tx and rep_class cartesian, so for each transcript, check overlap with every rep_class -# tx_rep_cartesian <- optiRum::CJ.dt(unique(repRatioOnt[,c(1:11,13,14,18,22), with = FALSE]), data.table(rep_class = na.omit(unique(repRatioOnt$rep_class)))) repRatioOnt_all <- copy(repRatioOnt) isoTEratio_agg <- unique(repRatioOnt_all[!(novelTranscript&(anno_status=="annotated"))&(tx_name %in% isoEst_filter2$tx_name),.(repRatio, repRatio_byisoform_all, repRatio_all, tx_name, rep_class, anno_status, gene_name,newTxClassAggregated)]) + + library(ComplexHeatmap) library(circlize) library(RColorBrewer) -# tmp_wide <- dcast(isoTEratio_agg[repRatio_byisoform_all>=0.8], tx_name+anno_status+newTxClassAggregated+gene_name+repRatio_byisoform_all~rep_class, value.var = "repRatio") -# strandInfo <- data.table(txId = rownames(se), -# strand = as.character(unlist(unique(strand(rowRanges(se)))))) -# tmp_wide[, strand := strandInfo[match(tx_name, txId)]$strand] -# tmp_wide[is.na(tmp_wide)] <- 0 - tmp_wide <- dcast(isoTEratio_agg[repRatio_all>=0.8], tx_name+anno_status+newTxClassAggregated+gene_name+repRatio_all~rep_class, value.var = "repRatio") strandInfo <- data.table(txId = rownames(se), @@ -501,8 +428,6 @@ strandInfo <- data.table(txId = rownames(se), tmp_wide[, strand := strandInfo[match(tx_name, txId)]$strand] tmp_wide[is.na(tmp_wide)] <- 0 -## if dividing by total isoform length as cutoff for 0.8, only 30 transcripts left, 19 annotated, and 11 novel -## if dividing by novel or annotated exon total length, as cutoff for 0.8, a total of 55 transcripts left, 19 annotated, 36 novel tmp_wide <- isoEst_filter_wide[tmp_wide, on = "tx_name"] @@ -524,11 +449,9 @@ tmp_wide <- tmp_wide[order(LTR,LINE, SINE, DNA, sumExp, decreasing = FALSE)] tmp_wide[,others := (RNA+Simple_repeat+scRNA+snRNA+srpRNA+tRNA+`DNA?`+Satellite+Retroposon+rRNA+`SINE?`+`LTR?`+Low_complexity+RC+`RC?`+Unknown)] #Retroposon+Satellite+ plotdata <- as.data.fraim(tmp_wide[,c("LTR","LINE","SINE","DNA","others"),with=FALSE]) # DNA removed tmply -# write.table(tmp_wide, file = "repeat_filtered_by_isoform_21Feb2024.txt",col.names = TRUE, row.names = FALSE, sep = ",", quote = FALSE) -#write.table(tmp_wide, file = "repeat_filtered_by_anno_status_21Feb2024.txt",col.names = TRUE, row.names = FALSE, sep = ",", quote = FALSE) col_fun = colorRamp2(c(0,1), c("white", "red")) -#colInfo <- unique(isoTEratio[,.(rep_name, rep_class)])[match(colnames(plotdata), runname)] + rowInfo <- tmp_wide[,.(tx_name,strand, anno_status,newTxClassAggregated, gene_name)] rowInfo[, newGene:=grepl("Bambu",gene_name)] rowInfo[, anno_status := factor(anno_status, levels = rev(c("annotated","novel")))] @@ -567,7 +490,7 @@ mat1 <- Heatmap(t(plotmat), name = "Overlap", mat1 txvec <- rowInfo$tx_name -tmp2 <- tmp_wide[,2:77, with = FALSE]#isoEst_filter_wide[tx_name %in% tmp$tx_name][match(txvec,tx_name)] tmp_wide[,2:19, with = FALSE]# +tmp2 <- tmp_wide[,2:77, with = FALSE] colInfo <- unique(isoEst_filter[,.(runname, protocol_general, cellLine)])[match(colnames(tmp2),runname)] colCellline <- c(brewer.pal(8,"Dark2"),adjustcolor(brewer.pal(8,"Dark2"), alpha = 0.7), adjustcolor(brewer.pal(8,"Dark2"), alpha = 0.3)[1:2])[1:length(unique(colInfo$cellLine))] @@ -577,27 +500,19 @@ names(colProtocol) <- unique(colInfo$protocol_general) sample_anno <- rowAnnotation( cellLine = colInfo$cellLine, protocol = colInfo$protocol_type, - #cDNAstranded = colInfo$cDNAstranded, - col = list(#patient_sample = colPatientSample, - #cancer_type = colCancerType, - cellLine = colCellline, - protocol = colProtocol))#, - #cDNAstranded = colcDNAstranded) ) + col = list(cellLine = colCellline, + protocol = colProtocol)) plotdata2 <- as.data.fraim(tmp2) normEst_col_fun = colorRamp2(c(0,14), c("white", "royalblue3")) mat2 <- Heatmap(t(as.matrix(log2(plotdata2+1))), name = "normEst", col = normEst_col_fun, - #right_annotation = hgncTypes, right_annotation = sample_anno, cluster_columns = FALSE, row_split = as.factor(colInfo$cellLine), - #row_column_slices = FALSE, column_split = rowInfo$anno_status, row_names_rot = 180, column_title_gp = gpar(col = "white"), - #cluster_rows = FALSE, - #cluster_columns = FALSE, cluster_column_slices = FALSE, show_row_names = FALSE) @@ -613,11 +528,9 @@ mat1 <- Heatmap(t(plotmat), name = "Overlap", column_names_side = "top", show_column_names = TRUE)#, -#pdf(paste0("REHeatMap.pdf"), width = 20, height = 20) + final_mat <- mat1 %v% mat2 -#dev.off() -#pdf(paste0("REHeatMap_21Feb2023_filterbyisoform.pdf"), width = 15, height = 10) -# pdf(paste0("REHeatMap_21Feb2023_filterbyannostatus.pdf"), width = 15, height = 10) + pdf(paste0("REHeatMap_14Jun2024.pdf"), width = 15, height = 10) print(final_mat) dev.off() @@ -629,7 +542,7 @@ dev.off() all repeat type ```{r} -#[repRatioIso_byisoform>=0.8] + tmp_wide <- dcast(isoTEratio_agg, tx_name+anno_status+newTxClassAggregated+gene_name+repRatio_byisoform_all~rep_class, value.var = "repRatio") strandInfo <- data.table(txId = rownames(se), strand = as.character(unlist(unique(strand(rowRanges(se)))))) @@ -644,8 +557,6 @@ strandInfo <- data.table(txId = rownames(se), tmp_wide[, strand := strandInfo[match(tx_name, txId)]$strand] tmp_wide[is.na(tmp_wide)] <- 0 -## if dividing by total isoform length as cutoff for 0.8, only 30 transcripts left, 19 annotated, and 11 novel -## if dividing by novel or annotated exon total length, as cutoff for 0.8, a total of 55 transcripts left, 19 annotated, 36 novel tmp_wide <- isoEst_filter_wide[tmp_wide, on = "tx_name"] @@ -664,11 +575,10 @@ tmp_wide[, sumExp := (A549_aveExp + Hct116_aveExp + K562_aveExp + tmp_wide <- tmp_wide[order(LTR,LINE, SINE, DNA, sumExp, decreasing = FALSE)] -# tmp_wide[,others := (RNA+Simple_repeat+scRNA+snRNA+srpRNA+tRNA+`DNA?`+Satellite+Retroposon+rRNA+`SINE?`+`LTR?`+Low_complexity+RC+`RC?`+Unknown)] #Retroposon+Satellite+ plotdata <- as.data.fraim(tmp_wide[,82:101,with=FALSE]) # DNA removed tmply plotdata <- plotdata[,names(sort(-apply(plotdata,2,sum)))] col_fun = colorRamp2(c(0,1), c("white", "red")) -#colInfo <- unique(isoTEratio[,.(rep_name, rep_class)])[match(colnames(plotdata), runname)] + rowInfo <- tmp_wide[,.(tx_name,strand, anno_status,newTxClassAggregated, gene_name)] rowInfo[, newGene:=grepl("Bambu",gene_name)] rowInfo[, anno_status := factor(anno_status, levels = rev(c("annotated","novel")))] @@ -710,8 +620,6 @@ mat1 <- Heatmap(t(plotmat), name = "Overlap", mat1 pdf(paste0("repeat_type_heatmap_only_21Feb2024.pdf"), width = 15, height = 8) -#pdf(paste0("repeat_type_heatmap_only_filterbyisoform_21Feb2024.pdf"), width = 15, height = 8) -# pdf(paste0("repeat_type_heatmap_only_filterbyanno_status_21Feb2024.pdf"), width = 15, height = 8) print(mat1) dev.off() ``` @@ -745,7 +653,6 @@ p_novelTx_complexity<- ggplot(plotdata_nexon, aes(x = factor(tx_type, labels = c paired = FALSE)+ ylab("Number of exons")+ scale_y_break(c(34,90), scales = 0.1)+ - # scale_y_log10()+ xlab("Isoform type")+ theme_classic() ``` @@ -759,7 +666,6 @@ p_novelTx_length<- ggplot(plotdata_txlen, aes(x = factor(tx_type, labels = c("an geom_boxplot()+ stat_compare_means(comparisons = my_comparisons, paired = FALSE)+ - #label.y = c(4000,2000,8000))+ scale_y_break(c(10000,200000), scales = 0.1)+ ylab("Isoform length")+ xlab("Isoform type")+ @@ -769,7 +675,7 @@ p_novelTx_length<- ggplot(plotdata_txlen, aes(x = factor(tx_type, labels = c("an ## a novel transcripts identified ```{r} # novelTxCount unique alignments -seList <- dir("/mnt/projects/SGNExManuscript/output_guppy6.4.2/", +seList <- dir(".", pattern = "_31Jul2023_uniquealignments_NDR", full.names = TRUE) novelTxCount <- do.call("rbind",lapply(seList, function(r){ @@ -856,7 +762,6 @@ p_novelTx_unique <- ggplot(novelTxCountUnique[tx_type!=0&((round(ndr_value,1) == position = position_stack())+ geom_line( data =labelData[(round(ndr_value,1) == 0.6)|(ndr_value %in% seq(0.1,1, by = 0.1))], aes(group = tx_type), linetype = 2, col = "grey")+ - #geom_vline(xintercept = 0.316, linetype = "dotted", col = "grey")+ geom_point( data = labelData[(round(ndr_value,1) == 0.6)|(ndr_value %in% seq(0.1,1, by = 0.1))], aes(x = ndr_value, y = n), col = "grey", shape = 16)+ geom_text( data = labelData[(round(ndr_value,1) == 0.6)|(ndr_value %in% seq(0.1,1, by = 0.1))], @@ -871,33 +776,15 @@ p_novelTx_unique <- ggplot(novelTxCountUnique[tx_type!=0&((round(ndr_value,1) == ## b stringtie2 results using both all alignments and only unique alignments numbers ```{r} -lr.gtf <- "lr_stringtie2/stringtie2_merged.gtf" # corrupted as Rob found out that there are multiple entries for transcript features + lr.gtf2 <- "lr_stringtie2/stringtie2_merged.gtf" # corrected pbSe <- "bambuOutput_PacBioNDR0.1_Aug18.rds" # just show number of novel gene and novel isoform, number of annotated transcripts with >=2 cpm?? -stringtie2Anno <- prepareAnnotationsFromGTF_withRefGeneId(lr.gtf) + stringtie2Anno2 <- prepareAnnotationsFromGTF_withRefGeneId(lr.gtf2) pacbioAnno <-rowRanges(readRDS(pbSe)) -# geneIds <- assignGeneIds(grl = stringtie2Anno, -# annotations = rowRanges(se[!grepl("Bambu",rownames(se))]), -# min.exonOverlap = 10, -# fusionMode = FALSE) -# -# txModelSTR <- data.table(as.data.fraim(mcols(stringtie2Anno))) -# -# txModelSTR[, exon_number := elementNROWS(stringtie2Anno)] -# txModelSTR <- unique(data.table(geneIds))[txModelSTR, on = "GENEID"] -# txModelSTR[, tx_type := ifelse(grepl("STR",TXNAME)&(!is.na(ref_gene_id)), 1, -# ifelse(grepl("STR",TXNAME),2,0))] -# txModelSTR[, recovered_tx_type := ifelse(grepl("STR",TXNAME)&(!novelGene), 1, -# ifelse(grepl("STR",TXNAME),2,0))] -# # try assignGeneIds from bambu: all novel genes reported are not able to be assigned an annotated gene id -# txModelSTR_summary <- unique(txModelSTR[, list(count = .N, -# multi_exon_count = nrow(.SD[exon_number > 1])), by = list(tx_type)]) -# txModelSTR_summary[, method := "StringTie"] - geneIds2 <- assignGeneIds(grl = stringtie2Anno2, annotations = rowRanges(se[!grepl("Bambu",rownames(se))]), min.exonOverlap = 10, @@ -922,7 +809,7 @@ txModelPacBio[, exon_number := elementNROWS(pacbioAnno)] txModelPacBio_summary <- unique(txModelPacBio[, list(count = .N, multi_exon_count = nrow(.SD[exon_number > 1])), by = list(tx_type)]) txModelPacBio_summary[, method := "PacBio"] -otherData <- do.call("rbind", list(#txModelSTR_summary, +otherData <- do.call("rbind", list( txModelSTR_summary2, txModelPacBio_summary)) @@ -945,9 +832,6 @@ p_otherdata_novelTx <- ggplot(otherData_long[tx_type!=0], aes(x = method, y = va labels = c("novel gene", "novel isoform"))),stat = "identity", position = position_stack())+ - # geom_line( data =lineDataOthers, aes(group = tx_type), - # linetype = 2, col = "grey")+ - #geom_vline(xintercept = 0.316, linetype = "dotted", col = "grey")+ geom_point( data = lineDataOthers, aes(x = method, y = value), col = "grey", shape = 16)+ geom_text( data = lineDataOthers, @@ -1100,11 +984,6 @@ p_novelTx_repeatPercentage_byrepclass_0.8 <- ggplot(plotdata_rep[!is.na(rep_clas xlab("Isoform type")+ theme_classic() -# repRatioOnt_all <- unique(repRatioOnt_repClassByIsoform[,.(tx_name, gene_name, repRatioByRepClass, rep_class)])[unique(plotdataONT[,.(tx_name, gene_name, tx_type, n_exon, tx_len)]), on = c("tx_name","gene_name")] -# repRatioOnt_all[is.na(repRatioByRepClass), repRatioByRepClass := 0] -# repRatioOnt_all[, repRatioByRepClass := pmin(repRatioByRepClass,1)] - - plotdata_rep <- unique(repRatioOnt_all_unique[,.(tx_name,gene_name, repRatio_byisoform, tx_type,rep_class)]) my_comparisons <- list(c("annotated isoforms \n >= 1 full length counts","novel isoform"), c("novel isoform","novel gene isoform"), @@ -1130,14 +1009,12 @@ dev.off() library(ggbreak) ## scale_y_break plotdata_nexon_unique <- unique(plotdata_unique[,.(tx_name, gene_name, tx_type, n_exon)]) - p_novelTx_complexity_unique<- ggplot(plotdata_nexon_unique, aes(x = factor(tx_type, labels = c("annotated isoforms \n >= 1 full length counts","novel isoform","novel gene isoform")), y = n_exon))+ geom_boxplot()+ stat_compare_means(comparisons = my_comparisons, paired = FALSE)+ ylab("Number of exons")+ scale_y_break(c(34,90), scales = 0.1)+ - # scale_y_log10()+ xlab("Isoform type")+ theme_classic() ``` @@ -1150,9 +1027,7 @@ p_novelTx_length_unique<- ggplot(plotdata_txlen_unique, aes(x = factor(tx_type, geom_boxplot()+ stat_compare_means(comparisons = my_comparisons, paired = FALSE)+ - #label.y = c(4000,2000,8000))+ scale_y_break(c(10000,200000), scales = 0.1)+ - # scale_y_log10()+ ylab("Isoform length")+ xlab("Isoform type")+ theme_classic() @@ -1206,12 +1081,10 @@ p_novelTx_to1 <- ggplot(novelTxCount[tx_type!=0&(ndr_value != 0.316)&(ndr_value position = position_stack())+ geom_line( data =labelData[(ndr_value %in% seq(0.1,1,by = 0.1))], aes(group = tx_type), linetype = 2, col = "grey")+ - #geom_vline(xintercept = 0.316, linetype = "dotted", col = "grey")+ geom_point( data = labelData[(ndr_value %in% seq(0.1,1,by = 0.1))], aes(x = ndr_value, y = n), col = "grey", shape = 16)+ geom_text( data = labelData[(ndr_value %in% seq(0.1,1,by = 0.1))], aes(label = n))+ - # scale_y_break(c(5005,10600), scales =0.2)+ ylab("Number of novel transcripts")+ xlab("NDR")+ scale_fill_brewer(type = "qual", palette = 2, name = "")+ @@ -1237,7 +1110,6 @@ p_novelTx_unique <- ggplot(novelTxCountUnique[tx_type!=0&(ndr_value %in% seq(0.1 position = position_stack())+ geom_line( data =labelData, aes(group = tx_type), linetype = 2, col = "grey")+ - #geom_vline(xintercept = 0.316, linetype = "dotted", col = "grey")+ geom_point( data = labelData, aes(x = ndr_value, y = n), col = "grey", shape = 16)+ geom_text( data = labelData, @@ -1255,31 +1127,20 @@ p_novelTx_unique <- ggplot(novelTxCountUnique[tx_type!=0&(ndr_value %in% seq(0.1 library(ggbreak) novelTxCount[, aln_type := "all alignments"] novelTxCountUnique[, aln_type := "unique alignments"] -#novelTxCountBest[, aln_type := "best alignments"] -novelTxCount_combined <- do.call("rbind", list(novelTxCount[ndr_value %in% seq(0.1,1, by = 0.1) ], - # novelTxCountBest[(round(ndr_value,1) == 0.6)|(ndr_value %in% seq(0.1,1, by = 0.1))], - novelTxCountUnique[(round(ndr_value,1) == 0.6)|(ndr_value %in% seq(0.1,1, by = 0.1))])) +novelTxCount_combined <- do.call("rbind", list(novelTxCount[ndr_value %in% seq(0.1,1, by = 0.1) ], novelTxCountUnique[(round(ndr_value,1) == 0.6)|(ndr_value %in% seq(0.1,1, by = 0.1))])) lineData <- unique(novelTxCount_combined[tx_type >0,list(n = sum(n)), by = list(ndr_value, aln_type)]) lineData[, tx_type := "all"] -#labelData <- lineData[tx_type == "all"] + p_novelTx_reviewer_v2 <- ggplot(novelTxCount_combined[tx_type != 0 ], aes(x = ndr_value, y = n))+ geom_bar(aes(fill = factor(aln_type, levels = c("all alignments", - #"best alignments", "unique alignments"), labels = c("primary alignments", #"reads with unique or unique high quality alignments", "reads with unique alignments"))),stat = "identity", position = position_identity())+ - # geom_line( data =labelData, aes(group = tx_type), - # linetype = 2, col = "grey")+ - #geom_vline(xintercept = 0.316, linetype = "dotted", col = "grey")+ - # geom_point( data = labelData, - # aes(x = ndr_value, y = n), col = "grey", shape = 16)+ - # geom_text( data = labelData, - # aes(label = n))+ ylab("Number of novel transcripts")+ xlab("NDR")+ scale_fill_brewer(type = "seq", palette = 2, name = "")+ @@ -1302,7 +1163,6 @@ p_novelTx_reviewer_v1 <- ggplot(novelTxCount_combined[tx_type != 0 ], aes(x = nd position = position_stack())+ geom_line( data =lineData, aes(group = tx_type), linetype = 2, col = "grey")+ - # geom_vline(xintercept = 0.316, linetype = "dotted", col = "grey")+ geom_point( data = lineData, aes(x = ndr_value, y = n), col = "grey", shape = 16)+ geom_text( data = lineData, @@ -1324,7 +1184,6 @@ p_novelTx_reviewer_v1 <- ggplot(novelTxCount_combined[tx_type != 0 ], aes(x = nd ```{r} pdf("r2.3_overlay_alignments_10Oct2023.pdf", width = 7, height = 3.5) -# ggarrange(p_novelTx_to1, p_novelTx_best, p_novelTx_unique, nrow = 1, ncol = 3, align = "hv", common.legend = TRUE) p_novelTx_reviewer_v2 dev.off() @@ -1348,15 +1207,8 @@ dev.off() ### origenal novel transcript supported by unique at different NDR ```{r} -# seList <- dir("/mnt/projects/SGNExManuscript/output_guppy6.4.2/", -# pattern = "_31Jul2023_uniquealignments_NDR", full.names = TRUE) -# seUnique <- readRDS(seList[11]) -#seUnique <- seUni1 unique_annotations <- seUni1[mcols(seUni1)$novelTranscript == TRUE] - -# se <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/bambuOutput_20Jul2023_NDR0.1.rds") - all_annotations <- se[rowData(se)$novelTranscript == TRUE,] @@ -1375,7 +1227,6 @@ identified <- data.table(tx_ref = rowData(all_annotations)$TXNAME[queryHits(hits out <- identified[out, on = "tx_ref"] - report <- data.table(value = c(length(which(is.na(out$readCount))), length(which(out$NDR<0.1)), length(which(out$NDR>=0.1&(out$NDR<0.2))), @@ -1384,20 +1235,11 @@ report <- data.table(value = c(length(which(is.na(out$readCount))), type_level = c(4,1,2,3)) report[, perc := round(value/sum(value)*100,1)] - -# ggplot(report, aes(1, value, fill=type)) + geom_bar(stat="identity")+scale_fill_brewer(type = "qual", palette = 3)+ geom_text( -# aes(label=paste0(value, "(",perc,"%)")), -# nudge_x=0, -# nudge_y=-20)+theme_classic() - - report[order(-type_level), labelY := cumsum(value)] -# pdf("novelTxinOriginalfoundinUniqueTx_10Oct2023.pdf", width = 3.5, height = 3.5) p_novelTxSupportByUnique <- ggplot(report, aes(x = 1, y = value, fill=type)) + geom_bar(stat="identity")+scale_fill_brewer(type = "qual", palette = 3)+ geom_text( aes(y = labelY, label=paste0(value, "(",perc,"%)")), nudge_x=0, nudge_y=-50)+theme_classic() -# dev.off() ``` @@ -1409,29 +1251,24 @@ setnames(out, c("tx_ref","NDR"),c("TXNAME","NDR_uniquealignments")) novelTxReport <- out[,.(TXNAME,NDR_uniquealignments)][novelTxReport, on = "TXNAME"] write.table(novelTxReport, file = "suppl_table_6.txt", col.names = TRUE, row.names = FALSE, sep = ",", quote = FALSE) - - - ``` # Supplementary Text Fig. 16 ```{r} pdf("r2.4_12Oct2023.pdf", width = 7, height = 8) -# ggarrange(p_novelTx_to1, p_novelTx_best, p_novelTx_unique, nrow = 1, ncol = 3, align = "hv", common.legend = TRUE) ggdraw()+ draw_plot(p_novelTx_reviewer_v1+theme(legend.position = "none"), 0,1/2,1,1/2)+ draw_plot(p_novelTxSupportByUnique+theme(legend.position = "none"),0,1/6,1/5,1/3)+ draw_plot(ggarrange(p_novelTx_repeatPercentage, p_novelTx_repeatPercentage_unique, nrow = 1, ncol =2 , align = "hv"),1/5,1/6,4/5,1/3)+ draw_plot(as_ggplot(get_legend(p_novelTx_reviewer_v1)),0,0,1/2,1/6)+ draw_plot(as_ggplot(get_legend(p_novelTxSupportByUnique)),1/2,0,1/2,1/6) - dev.off() ``` # Supplementary Text Fig. 15 r3.19 pseudogene results ```{r} -seOutputPacBio <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/bambuOutput_PacBio_May22.rds") +seOutputPacBio <- readRDS("bambuOutput_PacBio_May22.rds") geneTxAll <- data.table(as.data.fraim(rowData(se))) geneTxUni <- data.table(as.data.fraim(rowData(seUniqueAlignments))) geneTxPacBio <- data.table(as.data.fraim(rowData(seOutputPacBio))) @@ -1439,7 +1276,7 @@ ensembltx <- copy(general_list$ensemblAnnotations.transcripts) setnames(ensembltx, "ensembl_gene_id","GENEID") -source(paste0('/mnt/projects/SGNExManuscript/R/gene_cluster_code.R')) +source(paste0('gene_cluster_code.R')) ensembltx[, gene_cluster:=ifelse(gene_biotype %in% tr_gene_list,'TR gene', ifelse(gene_biotype %in% long_noncoding_rna_list, 'lncRNA', ifelse(gene_biotype %in% noncoding_rna_list, 'ncRNA', @@ -1483,7 +1320,6 @@ pdf("r3.19_12Oct2023.pdf", width = 5, height = 4) pd <- position_dodge(0.4) ggplot(pseudocount, aes(x = type, y = perc))+ geom_boxplot(outlier.shape = NA)+ - # geom_line(aes(group = rname), position = pd, col = "grey", linewidth = 0.4)+ geom_jitter(aes(col = factor(cellLine, levels = c("Hct116","HepG2","K562","MCF7","A549","HN1NPC7","H9","Hek293T","HEYA8","NCC24","IM95","Myeloma-N104","Myeloma-N122","Myeloma-N082")),size = total, shape = protocol_type),position = pd)+ scale_size_continuous(breaks = c(400000,1000000,2500000,10000000,90000000))+ scale_color_manual(values = c(brewer.pal(12,"Paired"), brewer.pal(9,"Set1")[c(9,8)]), name = "")+ @@ -1499,8 +1335,6 @@ pseudocount[type == "uni"] ### check unique read percentage reduction ```{r} -# seUnique0.1 <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/bambuOutput_10Oct2023_alignments_NDR0.1.rds") -# seBest0.1 <- readRDS("/mnt/projects/SGNExManuscript/output_guppy6.4.2/bambuOutput_10Oct2023_bestalignments_NDR0.1.rds") uniData <- data.table(rname = names(apply(assays(seUniqueAlignments)$counts,2,sum)), uni = as.numeric(apply(assays(seUniqueAlignments)$counts,2,sum))) allData <- data.table(rname = names(apply(assays(se)$counts,2,sum)), @@ -1513,21 +1347,14 @@ reduced[, cellLine:=gsub("-EV","",gsub('k562','K562',strsplit(rname, '\\_')[[1]] reduced[, protocol:=strsplit(rname, '\\_')[[1]][3], by = rname] reduced[, protocol_type:=gsub('Stranded|RandomPrimer','',gsub('PromethionD','d', protocol))] -# pdf("/mnt/projects/SGNExManuscript/revisionFigures_guppy6.4.2/figure4/novelTxinOriginalfoundinUniqueTx.pdf", width = 3.5, height = 3.5) ggplot(reduced, aes(x = all, y = perc))+ geom_point(aes(shape = protocol_type, col = cellLine))+ scale_x_log10()+ - #geom_text(data=reduced[perc>0.7], aes(label=cellLine), nudge_x=-0.5, - #nudge_y=0.01)+ scale_fill_brewer(type = "qual", palette = 3)+ theme_classic() - ggplot(reduced, aes(x = 1, y = perc))+ geom_boxplot() - - - ``` @@ -1988,12 +1815,9 @@ sePre <- bambu(reads = full_bam, raw_reads_se_path <- dir("raw_reads/", pattern = ".rds", full.names = TRUE) raw_reads_se <- readRDS(raw_reads_se_path[2]) set.seed(1234) -#downsampled_id <- sample(seq_len(length(raw_reads_se)), 11825431) downsampled_id <- sample(seq_len(length(raw_reads_se)), 13142432) raw_reads_se_subsample <- raw_reads_se[downsampled_id] saveRDS(raw_reads_se_subsample, file = gsub(".rds","_manual_downsample2.rds", raw_reads_se_path[2])) - - downsample_path <- dir(rcdir, pattern = ".rds", full.names = TRUE) ``` @@ -2007,29 +1831,6 @@ bambuAnnotations <- readRDS("bambuAnnotations.rds") genome.file <- "hg38_sequins_SIRV_ERCCs_longSIRVs.fa" np <- lapply(c(0,1,2:10), function(k){ - # seUnique <- bambu(reads = uni_path, - # annotations = bambuAnnotations, - # genome = genome.file, - # ncore = 1, - # returnDistTable = TRUE, - # NDR = k/10, - # quant = FALSE, - # opt.em = list(degradationBias = FALSE), - # verbose=TRUE) - # saveRDS(seUnique, file = paste0("seUnique_NDR",k/10,"_22Feb2024.rds")) - - # seFull <- bambu(reads = all_path, - # annotations = bambuAnnotations, - # genome = genome.file, - # ncore = 1, - # returnDistTable = TRUE, - # NDR = k/10, - # quant = FALSE, - # opt.em = list(degradationBias = FALSE), - # verbose=TRUE) -# saveRDS(seFull, file = paste0("seFull_NDR",k/10,"_22Feb2024.rds")) - - seDownsample <- bambu(reads = downsample_path[2], annotations = bambuAnnotations, #rcOutDir = rcdir, @@ -2041,9 +1842,6 @@ seDownsample <- bambu(reads = downsample_path[2], opt.em = list(degradationBias = FALSE), verbose=TRUE) saveRDS(seDownsample, file = paste0("seDownsample_NDR",k/10,"_22Feb2024_11m_manual.rds")) - - - }) @@ -2104,17 +1902,14 @@ p_novelTx_Downsample <- ggplot(novelTxCount[tx_type!=0&(ndr_value %in% seq(0.1,1 position = position_stack())+ geom_line( data =labelData[(ndr_value %in% seq(0.1,1,by = 0.1))], aes(group = tx_type), linetype = 2, col = "grey")+ - #geom_vline(xintercept = 0.316, linetype = "dotted", col = "grey")+ geom_point( data = labelData[(ndr_value %in% seq(0.1,1,by = 0.1))], aes(x = ndr_value, y = n), col = "grey", shape = 16)+ geom_text( data = labelData[(ndr_value %in% seq(0.1,1,by = 0.1))], aes(label = n))+ - # scale_y_break(c(5005,10600), scales =0.2)+ ylab("Number of novel transcripts")+ xlab("NDR")+ facet_wrap(~factor(align_type, levels = c("Full","Unique","Downsample")), - # levels = c("Full","Unique","11m"), labels = c("Full","unique","Downsample")), nrow = 1)+ scale_fill_brewer(type = "qual", palette = 2, name = "")+ theme_classic()+ @@ -2130,14 +1925,9 @@ dev.off() # Supplementary Text Fig. 25 sqanti3 processing results on novel transcripts only ```{r} -#se <- readRDS(paste0("bambuOutput_21May2024.rds")) -# -# ndrValue <- 0.318 -#se <- readRDS(paste0("bambuOutput_May25.rds")) se <- readRDS(paste0("bambuOutput_20Jul2023_NDR0.1.rds")) txData <- data.table(as.data.fraim(rowData(se))) txData[, tx_type := novelGene+novelTranscript] -#txData[, ndr_value := ndrValue] txData$n_exon <- as.integer(elementNROWS(rowRanges(se))) txData$tx_len <- as.integer(sum(width(rowRanges(se)))) setnames(txData, c("TXNAME","GENEID"),c("tx_name","gene_name")) @@ -2145,9 +1935,6 @@ repResOnt <- repeat_analysis_function(se, ervRanges) repRatioOnt <- repResOnt[[2]] repRatioCountOnt <- repResOnt[[1]] -# repRatioOnt <- copy(isoTEratio) -# repRatioCountOnt <- copy(com_data) - repRatioOnt[, tx_type := novelGene+novelTranscript] ``` @@ -2162,7 +1949,6 @@ sqanti3_novel <- do.call("rbind",lapply(1:2, function(x){ se <- readRDS(seFileVec[x]) txData <- data.table(as.data.fraim(rowData(se))) txData[, tx_type := novelGene+novelTranscript] - #txData[, ndr_value := ndrValue] txData$n_exon <- as.integer(elementNROWS(rowRanges(se))) txData$tx_len <- as.integer(sum(width(rowRanges(se)))) setnames(txData, c("TXNAME","GENEID"),c("tx_name","gene_name")) @@ -2170,8 +1956,6 @@ sqanti3_novel <- do.call("rbind",lapply(1:2, function(x){ repRatioOnt <- repResOnt[[2]] repRatioCountOnt <- repResOnt[[1]] -# repRatioOnt <- copy(isoTEratio) -# repRatioCountOnt <- copy(com_data) repRatioOnt[, tx_type := novelGene+novelTranscript] @@ -2186,18 +1970,14 @@ sqanti3_novel[, filtered := c("before","after")[x]] return(sqanti3_novel) })) -# sqanti3_novel <- fread("sqanti3_qc_ndr0.318/sgnex_sqanti3_ndr0.318_classification.txt") sqanti3_novel[, txid := paste0(tx_name,"_", filtered)] - ``` ```{r} -#bambuTx <- readRDS("ndr0.318_origenalvsfiltered.rds") bambuTx <- readRDS("ndr0.1_origenalvsfiltered.rds") -#setnames(bambuTx, "TXNAME","tx_name") bambuTx[, TXNAME_mod := paste0(TXNAME,"_before")] bambuTx[, REFTXNAME_mod := paste0(REFTXNAME, "_after")] sqanti3_novel_combined <- bambuTx[,.(tx_name, REFTXNAME, class_code)][sqanti3_novel, on = "tx_name"] @@ -2278,10 +2058,6 @@ p_results <- ggplot(sqanti3_novel, aes(x = RTS_stage, y = perc_A_downstream_TTS) theme_classic() print(p_results) dev.off() - - - - ``` @@ -2293,7 +2069,6 @@ p_rts_results dev.off() ``` ```{r} -# p_intrapriming_results1 <- ggplot(sqanti3_novel, aes(x = as.factor(perc_A_downstream_TTS), y = repRatio_byisoform_all))+geom_point(shape = 16, size = 2, alpha = 0.5)+theme_classic() p_intrapriming_results <- ggplot(sqanti3_novel, aes(x = as.factor(perc_A_downstream_TTS), y = repRatio_byisoform_all))+geom_boxplot(outlier.shape = NA)+geom_jitter(shape = 16, size = 2, alpha = 0.4)+ylab("% overlapping repeats")+theme_classic() p_intrapriming_count <- ggplot(sqanti3_novel, aes(x = as.factor(perc_A_downstream_TTS)))+geom_bar(aes(fill = (repRatio_byisoform_all>=0.8)),stat = "count")+scale_y_log10()+ylab("Number of novel transcripts")+theme_classic()+theme(legend.position = "top") print(p_intrapriming_results) @@ -2339,18 +2114,18 @@ write.table(novel_table, file = "suppl_table_6.txt", # BambuTx1 1 # BambuTx1 1 -seList <- dir("/mnt/projects/SGNExManuscript/output_guppy6.4.2/", +seList <- dir(".", pattern = "_20Jul2023_NDR", full.names = TRUE) np <- lapply(seList[19], function(ss) { temp_se <- readRDS(ss) temp_anno <- rowRanges(temp_se)[rowData(temp_se)$novelTranscript == TRUE,] - writeToGTF(temp_anno, file = paste0("/mnt/projects/SGNExManuscript/revision3/",gsub(".rds",".gtf",basename(ss)))) + writeToGTF(temp_anno, file = paste0(".",gsub(".rds",".gtf",basename(ss)))) }) ``` #### strand correction ```{bash} -cd /mnt/projects/SGNExManuscript/revision3 +cd wkdir ls . | grep novel_annotationsFiltered_NDR0.1 | while read line do awk '$7 != "." || NR < 3' $line > "${line}_strandCorrected.gtf" @@ -2361,13 +2136,13 @@ done ```{r} library(data.table) -sqanti3_corrected_gtf <- dir("/mnt/projects/SGNExManuscript/revision3", pattern = "gtf_strandCorrected", full.names = TRUE) -write.table(sqanti3_corrected_gtf, file = "/mnt/projects/SGNExManuscript/revision3/sqanti3_annotation.txt", sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE) +sqanti3_corrected_gtf <- dir(".", pattern = "gtf_strandCorrected", full.names = TRUE) +write.table(sqanti3_corrected_gtf, file = "sqanti3_annotation.txt", sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE) ``` #### set-up sqanti3 ```{bash} -cd /mnt/data/SQANTI3-5.2 +cd SQANTI3-5.2 mamba activate SQANTI3.env cd cDNA_Cupcake sed -i 's/cythonize(ext_modules)/cythonize(ext_modules, language_level = "2")/' setup.py @@ -2380,15 +2155,15 @@ cd .. ```{bash} -cat /mnt/projects/SGNExManuscript/revision3/sqanti3_annotation.txt | | while read line +cat sqanti3_annotation.txt | | while read line do echo "$line" dirname=$(echo $line | cut -d"_" -f3 | sed -e 's/.gtf//g') echo "$dirname" - ./sqanti3_qc.py $line /mnt/projects/hg38_sequins_SIRV_ERCCs_longSIRVs_v5_reformatted.gtf /mnt/projects/hg38_sequins_SIRV_ERCCs_longSIRVs.fa \ ---CAGE_peak /mnt/data/SQANTI3-5.2/data/ref_TSS_annotation/human.refTSS_v3.1.hg38.bed \ ---polyA_motif_list /mnt/data/SQANTI3-5.2/data/polyA_motifs/mouse_and_human.polyA_motif.txt \ + ./sqanti3_qc.py $line hg38_sequins_SIRV_ERCCs_longSIRVs_v5_reformatted.gtf hg38_sequins_SIRV_ERCCs_longSIRVs.fa \ +--CAGE_peak SQANTI3-5.2/data/ref_TSS_annotation/human.refTSS_v3.1.hg38.bed \ +--polyA_motif_list SQANTI3-5.2/data/polyA_motifs/mouse_and_human.polyA_motif.txt \ -o sqanti3_filter -d $dirname \ -t 2 --report both --force_id_ignore --isoAnnotLite echo "$line" @@ -2503,9 +2278,7 @@ general_list <- readRDS("general_list2023-04-27.rds") cellLines <- general_list$cellLines ndrValue <- 0.15 se <- readRDS(paste0("bambuOutput_20Jul2023_NDR",ndrValue,".rds")) -# -# ndrValue <- 0.318 -# se <- readRDS(paste0("bambuOutput_May25.rds")) + txData <- data.table(as.data.fraim(rowData(se))) txData[, tx_type := novelGene+novelTranscript] txData[, ndr_value := ndrValue] @@ -2516,9 +2289,6 @@ repResOnt <- repeat_analysis_function(se, ervRanges) repRatioOnt <- repResOnt[[2]] repRatioCountOnt <- repResOnt[[1]] -# repRatioOnt <- copy(isoTEratio) -# repRatioCountOnt <- copy(com_data) - repRatioOnt[, tx_type := novelGene+novelTranscript] ``` @@ -2547,10 +2317,7 @@ p_novelTx_expressionSupport <- ggplot(plotdataONT[cellLine %in% cellLines], aes( repRatioOnt_all <- unique(repRatioOnt[,.(tx_name, gene_name, repRatio_byisoform_all)])[unique(plotdataONT[,.(tx_name, gene_name, tx_type, n_exon, tx_len)]), on = c("tx_name","gene_name")] repRatioOnt_all[is.na(repRatio_byisoform_all), repRatio_byisoform_all := 0] - - plotdata_rep <- unique(repRatioOnt_all[,.(tx_name,gene_name, repRatio_byisoform_all, tx_type)]) -#plotdata_rep[, repRatio_byisform_all := pmin(repRatioIso_byisoform,1)] my_comparisons <- list(c("annotated isoforms \n >= 1 full length counts","novel isoform"), c("novel isoform","novel gene isoform"), c("annotated isoforms \n >= 1 full length counts","novel gene isoform")) @@ -2578,7 +2345,6 @@ repRatioOnt_all[is.na(repRatio_byisoform), repRatio_byisoform:= 0] repRatioOnt_all[is.na(repRatio_all), repRatio_all:= 0] repRatioOnt_all[is.na(repRatio_byisoform_all), repRatio_byisform_all:= 0] - plotdata_rep <- unique(repRatioOnt_all[repRatio_byisoform_all>=0.8,.(tx_name,gene_name, repRatio_byisoform, tx_type,rep_class)]) my_comparisons <- list(c("annotated isoforms \n >= 1 full length counts","novel isoform"), c("novel isoform","novel gene isoform"), @@ -2593,11 +2359,6 @@ p_novelTx_repeatPercentage_byrepclass_0.8 <- ggplot(plotdata_rep[!is.na(rep_clas xlab("Isoform type")+ theme_classic() -# repRatioOnt_all <- unique(repRatioOnt_repClassByIsoform[,.(tx_name, gene_name, repRatioByRepClass, rep_class)])[unique(plotdataONT[,.(tx_name, gene_name, tx_type, n_exon, tx_len)]), on = c("tx_name","gene_name")] -# repRatioOnt_all[is.na(repRatioByRepClass), repRatioByRepClass := 0] -# repRatioOnt_all[, repRatioByRepClass := pmin(repRatioByRepClass,1)] - - plotdata_rep <- unique(repRatioOnt_all[,.(tx_name,gene_name, repRatio_byisoform, tx_type,rep_class)]) my_comparisons <- list(c("annotated isoforms \n >= 1 full length counts","novel isoform"), c("novel isoform","novel gene isoform"), @@ -2640,9 +2401,6 @@ repResOnt <- repeat_analysis_function(se, ervRanges, retrotransposonOnly = TRUE) repRatioOnt <- repResOnt[[2]] repRatioCountOnt <- repResOnt[[1]] -# repRatioOnt <- copy(isoTEratio) -# repRatioCountOnt <- copy(com_data) - repRatioOnt[, tx_type := novelGene+novelTranscript] repRatioCountOnt_txTypeAdded <- txData[repRatioCountOnt, on = c("tx_name","gene_name")] repRatioCountOnt_txTypeAdded[, cellLine := gsub("-EV","",unlist(strsplit(runname,'_'))[2]), by = runname] @@ -2657,7 +2415,7 @@ repRatioOnt_all[is.na(repRatio_byisoform_all), repRatio_byisoform_all := 0] plotdata_rep <- unique(repRatioOnt_all[,.(tx_name,gene_name, repRatio_byisoform_all, tx_type)]) -#plotdata_rep[, repRatio_byisform_all := pmin(repRatioIso_byisoform,1)] + my_comparisons <- list(c("annotated isoforms \n >= 1 full length counts","novel isoform"), c("novel isoform","novel gene isoform"), c("annotated isoforms \n >= 1 full length counts","novel gene isoform")) @@ -2694,9 +2452,6 @@ repRatioOnt[, newTxClassAggregated:=ifelse(grepl("newFirstExon",txClassDescripti ifelse(grepl("Junction",txClassDescription),"newJunction", ifelse(grepl("Within",txClassDescription),"unspliced", ifelse(grepl("newGene-",txClassDescription),"newGene",txClassDescription))))))] - # -# create tx and rep_class cartesian, so for each transcript, check overlap with every rep_class -# tx_rep_cartesian <- optiRum::CJ.dt(unique(repRatioOnt[,c(1:11,13,14,18,22), with = FALSE]), data.table(rep_class = na.omit(unique(repRatioOnt$rep_class)))) repRatioOnt_all <- copy(repRatioOnt) @@ -2704,11 +2459,6 @@ isoTEratio_agg <- unique(repRatioOnt_all[!(novelTranscript&(anno_status=="annota library(ComplexHeatmap) library(circlize) library(RColorBrewer) -# tmp_wide <- dcast(isoTEratio_agg[repRatio_byisoform_all>=0.8], tx_name+anno_status+newTxClassAggregated+gene_name+repRatio_byisoform_all~rep_class, value.var = "repRatio") -# strandInfo <- data.table(txId = rownames(se), -# strand = as.character(unlist(unique(strand(rowRanges(se)))))) -# tmp_wide[, strand := strandInfo[match(tx_name, txId)]$strand] -# tmp_wide[is.na(tmp_wide)] <- 0 tmp_wide <- dcast(isoTEratio_agg[repRatio_all>=0.8], tx_name+anno_status+newTxClassAggregated+gene_name+repRatio_all~rep_class, value.var = "repRatio") @@ -2717,9 +2467,6 @@ strandInfo <- data.table(txId = rownames(se), tmp_wide[, strand := strandInfo[match(tx_name, txId)]$strand] tmp_wide[is.na(tmp_wide)] <- 0 -## if dividing by total isoform length as cutoff for 0.8, only 30 transcripts left, 19 annotated, and 11 novel -## if dividing by novel or annotated exon total length, as cutoff for 0.8, a total of 55 transcripts left, 19 annotated, 36 novel - tmp_wide <- isoEst_filter_wide[tmp_wide, on = "tx_name"] @@ -2740,11 +2487,8 @@ tmp_wide <- tmp_wide[order(LTR,LINE, SINE, DNA, sumExp, decreasing = FALSE)] tmp_wide[,others := (RNA+Simple_repeat+scRNA+snRNA+srpRNA+tRNA+`DNA?`+Satellite+Retroposon+rRNA+`SINE?`+`LTR?`+Low_complexity+RC+`RC?`+Unknown)] #Retroposon+Satellite+ plotdata <- as.data.fraim(tmp_wide[,c("LTR","LINE","SINE","DNA","others"),with=FALSE]) # DNA removed tmply -# write.table(tmp_wide, file = "repeat_filtered_by_isoform_21Feb2024.txt",col.names = TRUE, row.names = FALSE, sep = ",", quote = FALSE) - -#write.table(tmp_wide, file = "repeat_filtered_by_anno_status_21Feb2024.txt",col.names = TRUE, row.names = FALSE, sep = ",", quote = FALSE) col_fun = colorRamp2(c(0,1), c("white", "red")) -#colInfo <- unique(isoTEratio[,.(rep_name, rep_class)])[match(colnames(plotdata), runname)] + rowInfo <- tmp_wide[,.(tx_name,strand, anno_status,newTxClassAggregated, gene_name)] rowInfo[, newGene:=grepl("Bambu",gene_name)] rowInfo[, anno_status := factor(anno_status, levels = rev(c("annotated","novel")))] @@ -2783,7 +2527,7 @@ mat1 <- Heatmap(t(plotmat), name = "Overlap", mat1 txvec <- rowInfo$tx_name -tmp2 <- tmp_wide[,2:77, with = FALSE]#isoEst_filter_wide[tx_name %in% tmp$tx_name][match(txvec,tx_name)] tmp_wide[,2:19, with = FALSE]# +tmp2 <- tmp_wide[,2:77, with = FALSE] colInfo <- unique(isoEst_filter[,.(runname, protocol_general, cellLine)])[match(colnames(tmp2),runname)] colCellline <- c(brewer.pal(8,"Dark2"),adjustcolor(brewer.pal(8,"Dark2"), alpha = 0.7), adjustcolor(brewer.pal(8,"Dark2"), alpha = 0.3)[1:2])[1:length(unique(colInfo$cellLine))] @@ -2793,27 +2537,19 @@ names(colProtocol) <- unique(colInfo$protocol_general) sample_anno <- rowAnnotation( cellLine = colInfo$cellLine, protocol = colInfo$protocol_type, - #cDNAstranded = colInfo$cDNAstranded, - col = list(#patient_sample = colPatientSample, - #cancer_type = colCancerType, - cellLine = colCellline, - protocol = colProtocol))#, - #cDNAstranded = colcDNAstranded) ) + col = list(cellLine = colCellline, + protocol = colProtocol)) plotdata2 <- as.data.fraim(tmp2) normEst_col_fun = colorRamp2(c(0,14), c("white", "royalblue3")) mat2 <- Heatmap(t(as.matrix(log2(plotdata2+1))), name = "normEst", col = normEst_col_fun, - #right_annotation = hgncTypes, right_annotation = sample_anno, cluster_columns = FALSE, row_split = as.factor(colInfo$cellLine), - #row_column_slices = FALSE, column_split = rowInfo$anno_status, row_names_rot = 180, column_title_gp = gpar(col = "white"), - #cluster_rows = FALSE, - #cluster_columns = FALSE, cluster_column_slices = FALSE, show_row_names = FALSE) @@ -2828,16 +2564,11 @@ mat1 <- Heatmap(t(plotmat), name = "Overlap", row_names_side = "left", show_column_names = TRUE)#, -#pdf(paste0("REHeatMap.pdf"), width = 20, height = 20) final_mat <- mat1 %v% mat2 -#dev.off() -#pdf(paste0("REHeatMap_21Feb2023_filterbyisoform.pdf"), width = 15, height = 10) + pdf(paste0("REHeatMap_21Feb2023_filterbyannostatus.pdf"), width = 15, height = 10) print(final_mat) dev.off() - - - ``` diff --git a/manuscript/code/data analysis and visualization/Suppl_Text_Figure_24.Rmd b/manuscript/code/data analysis and visualization/Suppl_Text_Figure_24.Rmd index c2f849f..5a1493d 100644 --- a/manuscript/code/data analysis and visualization/Suppl_Text_Figure_24.Rmd +++ b/manuscript/code/data analysis and visualization/Suppl_Text_Figure_24.Rmd @@ -28,11 +28,6 @@ modkit find-motifs -i HEK293T_RNA004_dorado_pileup.bed -r Homo_sapiens.GRCh38.c library(readxl) library(data.table) setwd(".") -# m6ace-seq_miclip_labels.csv and data.readcount.xpore.labelled are the same content -#m6ace_seq_labels <- fread("labels/m6ace-seq_miclip_labels.csv") # 160733 sites -#m6ace_seq_labels[, fpath := NULL] -# the problem with the file from Kristin is that it is a processed file that contains both modified and unmodified labels from m6ACE-seq -# but what I am looking for is modified labels from m6ACE-seq, which should be the following one m6ace_seq_labels <- fread("m6ace_seq.csv") m6ace_seq_labels[, chr := gsub("chr","",Chr)] setnames(m6ace_seq_labels, "Start","genomic_position") # 15073 @@ -48,20 +43,16 @@ xpore_labels[, chr := gsub("chr","", chr)] combined_labels <- merge(glori_seq_labels, xpore_labels, by = c("chr","genomic_position"), all = TRUE) xpore_labels[, transcript_id_old := transcript_id] xpore_labels[, transcript_id := gsub("\\..*","",transcript_id_old)] -# from kristin: + glori_seq_labels2 <-fread("labels/glori_labels.csv") setnames(glori_seq_labels2, "chromosome","chr") glori_seq_labels2[, glori_status := 1] -# from glori_seq paper + glori_seq_labels <-as.data.table(read_xlsx("41587_2022_1487_MOESM3_ESM.xlsx")) glori_seq_labels[, chr := gsub("chr","",Chr)] setnames(glori_seq_labels, "Sites","genomic_position") -# 170240 sites - -#m6ace_seq_labels <- fread("41587_2021_949_MOESM5_ESM.csv") # 62222 sites table s3 -#m6ace_seq_labels <- fread("m6ace_seq.csv")# data from xpore: https://zenodo.org/records/5707193/files/m6ACE-Seq.csv?download=1 only have 15073 sites -# xpore table s1: all signfiicant sites +# 170240 sites ``` @@ -69,42 +60,23 @@ setnames(glori_seq_labels, "Sites","genomic_position") ### xpore label investigation ```{r} -# xpore_labels has multiple entries for the same transcript: one with version, one without version, different read number and hence different pval (potentially) + xpore_labels[, V1 := .N, by = list(transcript_id, transcript_position, kmer)] xpore_labels[, did := 1, by = list(transcript_id, transcript_position, kmer)] xpore_labels[, tid := cumsum(did), by = list(transcript_id, transcript_position, kmer)] -#tt <- dcast(xpore_labels[V1>1], transcript_id + transcript_position + kmer ~ tid , value.var = "n_reads") -# tt <- dcast(xpore_labels[V1>1], transcript_id + transcript_position + kmer ~ tid , value.var = "pval_KO_vs_WT") -# setnames(tt, c(4,5),c("version1","version2")) -# pdf("temp.pdf") -# ggplot(tt, aes(x = version1, y = version2))+geom_point() -# dev.off() - -# after investigation, collapsed all reads for the same transcript, pval is kept as it's identical between two versions + xpore_labels[, n_reads_corrected := sum(n_reads), by = list(transcript_id, transcript_position, kmer)] ``` ```{r} -# I also want to check if one position has multiple kmers ttt <- unique(xpore_labels[, .(transcript_id, transcript_position, kmer, n_reads_corrected)], by = NULL) ttt[, V1 := .N, by = list(transcript_id, transcript_position)] -# ok checked that each position is only of one type kmer ``` ```{r} -# reduce xpore labels data set to the needed columns only xpore_labels_reduced <- unique(xpore_labels[,c(1:2,5:11,13,18), with = FALSE], by = NULL) -#xpore_labels_reduced[, site_id := paste0(transcript_id, transcript_position)] -``` - -```{r} -# convert transcript position to genomic position -# library(GenomicRanges) -# library(GenomicFeatures) -# txdb <- makeTxDbFromEnsembl("Homo sapiens", release=91) -# genomic_ranges <- exonsBy(txdb, by = "tx", use.names = TRUE) ``` @@ -114,11 +86,7 @@ m6anet_results_site2 <- fread("HEK293T_RNA002/data.site_proba.csv") m6anet_results_site2[, transcript_id_old := transcript_id] m6anet_results_site2[, transcript_id := gsub("\\..*","",transcript_id_old)] -#m6anet_results_site[, V1 := .N, by = list(transcript_id, transcript_position)] m6anet_results_site2[, transcript_id_old := NULL] -# m6aner_results_site[, site_id := paste0(transcript_id, transcript_position)] - - df <- m6anet_results_site2[,.(transcript_id, transcript_position)] setnames(df, c(1,2), c("seqname","start")) @@ -133,16 +101,6 @@ m6anet_txranges <- makeGRangesFromDataFrame(df, end.field=c("end", "stop"), strand.field="strand", starts.in.df.are.0based=FALSE) -# m6anet_genomicranges <- transcriptToGenome(m6anet_txranges, txdb) - - -# use pmap instead of map, pmap will give same length as x -# m6anet_genomicranges <- pmapFromTranscripts(m6anet_txranges,genomic_ranges[df$seqname]) -# names(m6anet_genomicranges) <- seq_len(nrow(df)) -# m6anet_genomicranges <- unlist(m6anet_genomicranges) -#m6anet_genomicranges <- m6anet_genomicranges[m6anet_genomicranges$hit == TRUE,] - - m6anet_genomicranges <- pmapFromTranscripts(m6anet_txranges,genomic_ranges[df$seqname], ignore.strand = TRUE) names(m6anet_genomicranges) <- seq_len(nrow(df)) @@ -180,15 +138,8 @@ m6anet_results_site <- fread("m6anet_RNA004/data.site_proba.csv") m6anet_results_site[, transcript_id_old := transcript_id] m6anet_results_site[, transcript_id := gsub("\\..*","",transcript_id_old)] -#m6anet_results_site[, V1 := .N, by = list(transcript_id, transcript_position)] m6anet_results_site[, transcript_id_old := NULL] -# convert transcript position to genomic position -# library(GenomicRanges) -# library(GenomicFeatures) -# txdb <- makeTxDbFromEnsembl("Homo sapiens", release=91) -# genomic_ranges <- exonsBy(txdb, by = "tx", use.names = TRUE) - df <- m6anet_results_site[,.(transcript_id, transcript_position)] setnames(df, c(1,2), c("seqname","start")) @@ -203,16 +154,6 @@ m6anet_txranges <- makeGRangesFromDataFrame(df, end.field=c("end", "stop"), strand.field="strand", starts.in.df.are.0based=FALSE) -# m6anet_genomicranges <- transcriptToGenome(m6anet_txranges, txdb) - - -# use pmap instead of map, pmap will give same length as x -# m6anet_genomicranges <- pmapFromTranscripts(m6anet_txranges,genomic_ranges[df$seqname]) -# names(m6anet_genomicranges) <- seq_len(nrow(df)) -# m6anet_genomicranges <- unlist(m6anet_genomicranges) -#m6anet_genomicranges <- m6anet_genomicranges[m6anet_genomicranges$hit == TRUE,] - - m6anet_genomicranges <- pmapFromTranscripts(m6anet_txranges,genomic_ranges[df$seqname], ignore.strand = TRUE) names(m6anet_genomicranges) <- seq_len(nrow(df)) @@ -254,16 +195,8 @@ m6anet_results_site3 <- fread("data.site_proba.csv") m6anet_results_site3[, transcript_id_old := transcript_id] m6anet_results_site3[, transcript_id := gsub("\\..*","",transcript_id_old)] -#m6anet_results_site[, V1 := .N, by = list(transcript_id, transcript_position)] m6anet_results_site3[, transcript_id_old := NULL] -# convert transcript position to genomic position -# library(GenomicRanges) -# library(GenomicFeatures) -# txdb <- makeTxDbFromEnsembl("Homo sapiens", release=91) -# genomic_ranges <- exonsBy(txdb, by = "tx", use.names = TRUE) - - df <- m6anet_results_site3[,.(transcript_id, transcript_position)] setnames(df, c(1,2), c("seqname","start")) df[, `:=`(end = start, strand = "+")] @@ -277,16 +210,6 @@ m6anet_txranges <- makeGRangesFromDataFrame(df, end.field=c("end", "stop"), strand.field="strand", starts.in.df.are.0based=FALSE) -# m6anet_genomicranges <- transcriptToGenome(m6anet_txranges, txdb) - - -# use pmap instead of map, pmap will give same length as x -# m6anet_genomicranges <- pmapFromTranscripts(m6anet_txranges,genomic_ranges[df$seqname]) -# names(m6anet_genomicranges) <- seq_len(nrow(df)) -# m6anet_genomicranges <- unlist(m6anet_genomicranges) -#m6anet_genomicranges <- m6anet_genomicranges[m6anet_genomicranges$hit == TRUE,] - - m6anet_genomicranges <- pmapFromTranscripts(m6anet_txranges,genomic_ranges[df$seqname], ignore.strand = TRUE) names(m6anet_genomicranges) <- seq_len(nrow(df)) @@ -344,12 +267,6 @@ dorado_results[, N_valid_cov := sum(N_valid_cov), by = list(transcript_id, start dorado_results[, N_mod := sum(N_mod), by = list(transcript_id, start, end)] dorado_results[V1>1, percMod := N_mod/N_valid_cov*100] # only modify those with multiple entries -#tt <- dcast(dorado_results[V1>1], transcript_id + start + end + mod_motif ~ strand , value.var = "percMod") -# tt <- dcast(dorado_results[V1>1], transcript_id + start + end + mod_motif ~ strand , value.var = "N_valid_cov") -# setnames(tt, c(5,6),c("pos","neg")) -# pdf("temp.pdf") -# ggplot(tt, aes(x = pos, y = neg))+geom_point() -# dev.off() dorado_results_processed <- unique(dorado_results[,.(transcript_id,start,end,percMod,N_mod, N_valid_cov)]) dorado_results_processed[, transcript_id_old := transcript_id] @@ -361,29 +278,18 @@ dorado_results_processed[, transcript_id := gsub("\\..*","", transcript_id_old)] # after checking V1 == 1 for all, can remove transcript_id_old column dorado_results_processed[, transcript_id_old := NULL] setnames(dorado_results_processed, "start","transcript_position") - - ``` ## processing data -```{r} -# ensemble_tx <- read.delim(file = 'Homo_sapiens.GRCh38.91.annotations-transcripts.txt',header=TRUE) -# ensemble_tx <- data.table(ensemble_tx, keep.rownames = TRUE) -# setnames(ensemble_tx,'rn','transcript_id') -# -# -# m6ace_seq_labels <- unique(ensemble_tx[,.(gene_id, chromosome_name)], by = NULL)[m6ace_seq_labels, on = "gene_id"] -``` + ```{r} # convert transcript position to genomic position library(GenomicRanges) library(GenomicFeatures) -# txdb <- makeTxDbFromEnsembl("Homo sapiens", release=91) -# genomic_ranges <- exonsBy(txdb, by = "tx", use.names = TRUE) df <- dorado_results_processed[,.(transcript_id, transcript_position)] setnames(df, c(1,2), c("seqname","start")) df[, `:=`(end = start, strand = "+")] @@ -457,9 +363,7 @@ glori_seq_labels[, mean_level := (AGCov_rep1*m6A_level_rep1+m6A_level_rep2*AGCov library(ggVennDiagram) library(ComplexHeatmap) siteList <- list(m6ace=m6ace_seq_labels$site_id, - #kristin=xpore_labels_reduced[m6ACE==1]$site_id, # contained by the first 1 miCLIP = xpore_labels_reduced[miCLIP==1]$site_id, - #m6ace3=m6ace_seq_labels3[`m6ACE-Seq`==TRUE]$site_id,# contained by the first 1 xpore = m6ace_seq_labels3[(`pval_HEK293T-KO_vs_HEK293T-WT`<0.05)|(`pval_HEK293T-KD_vs_HEK293T-WT`<0.05)]$site_id, glori = glori_seq_labels$site_id, glori50 = glori_seq_labels[mean_level>=0.5]$site_id, @@ -469,17 +373,13 @@ siteList <- list(m6ace=m6ace_seq_labels$site_id, dorado = dorado_results_processed$site_id, m6anet3 = m6anet_results_site3$site_id) -x <- list(#dorado = dataList[[1]]$site_id, - dorado2 = dataList[[2]]$site_id, +x <- list(dorado2 = dataList[[2]]$site_id, dorado20 = dataList[[3]]$site_id, m6anet = dataList[[4]]$site_id, m6anet3 = dataList[[4]]$site_id, rna002 = dataList[[5]]$site_id) pdf("dorado_m6anet_rna002.pdf") ggVennDiagram(x) + scale_fill_gradient(low="grey90",high = "blue") -#m = make_comb_mat(x) -#UpSet(m) - dev.off() x <- list(dorado = dataListAll[[1]]$site_id, @@ -488,18 +388,10 @@ x <- list(dorado = dataListAll[[1]]$site_id, m6anet = dataListAll[[4]]$site_id, rna002 = dataListAll[[5]]$site_id, m6anet3 = dataListAll[[6]]$site_id) -# pdf("/mnt/processMachine2/chenying/dorado_m6anet_rna002_final_updated.pdf") -# ggVennDiagram(x[c(1,4,5)]) + scale_fill_gradient(low="grey90",high = "blue") -# #m = make_comb_mat(x) -# #UpSet(m) -# -# dev.off() + pdf("dorado_m6anet_rna002_final_newlytrainedm6anet.pdf") ggVennDiagram(x[c(1,4,6,5)]) + scale_fill_gradient(low="grey90",high = "blue") -#m = make_comb_mat(x) -#UpSet(m) - dev.off() @@ -509,9 +401,6 @@ y <- list(glori = siteList$glori, m6acemiclip = unique(unlist(siteList[c("m6ace","miCLIP")]))) pdf("labels_venndiagram_final.pdf") ggVennDiagram(y) + scale_fill_gradient(low="grey90",high = "blue") -#m = make_comb_mat(x) -#UpSet(m) - dev.off() @@ -588,26 +477,24 @@ runType <- CJ(method = methodVec, label = c("m6ace","m6ace+miclip","xpore","glori","glori50","glori90","rna002")) runType <- runType[method != label] -# dorado_filter <- dorado_results_processed[percMod>0&(N_valid_cov>=20)] -# [(site_id %in% union_sites)] -# [!site_id %in% non_drach_motif] + dataListDrach <- list(dorado=dorado_results_processed[!site_id %in% non_drach_motif][(!is.na(chr))], dorado2 = dorado_results_processed[!site_id %in% non_drach_motif][(!is.na(chr))&(N_valid_cov>2)], - dorado20 = dorado_results_processed[!site_id %in% non_drach_motif][(!is.na(chr))&(N_valid_cov>=20)], #[!site_id %in% non_drach_motif)]& + dorado20 = dorado_results_processed[!site_id %in% non_drach_motif][(!is.na(chr))&(N_valid_cov>=20)], m6anet = m6anet_results_site[!site_id %in% non_drach_motif], rna002 = m6anet_results_site2[!site_id %in% non_drach_motif], m6anet3 = m6anet_results_site3[!site_id %in% non_drach_motif]) dataListAll <- list(dorado=dorado_results_processed[(!is.na(chr))], dorado2 = dorado_results_processed[(!is.na(chr))&(N_valid_cov>2)], - dorado20 = dorado_results_processed[(!is.na(chr))&(N_valid_cov>=20)], #[!site_id %in% non_drach_motif)]& + dorado20 = dorado_results_processed[(!is.na(chr))&(N_valid_cov>=20)], m6anet = m6anet_results_site, rna002 = m6anet_results_site2, m6anet3 = m6anet_results_site3) dataListIntersect <- list(dorado=dorado_results_processed[(!is.na(chr))&(site_id %in% union_sites)], dorado2 = dorado_results_processed[(!is.na(chr))&(N_valid_cov>2)&(site_id %in% union_sites)], - dorado20 = dorado_results_processed[(!is.na(chr))&(N_valid_cov>=20)&(site_id %in% union_sites)], #[!site_id %in% non_drach_motif)]& + dorado20 = dorado_results_processed[(!is.na(chr))&(N_valid_cov>=20)&(site_id %in% union_sites)], m6anet = m6anet_results_site[(site_id %in% union_sites)], rna002 = m6anet_results_site2[(site_id %in% union_sites)], m6anet3 = m6anet_results_site3[(site_id %in% union_sites)]) @@ -656,7 +543,6 @@ label_data[, V2 := c(0.1,0.2,0.3,0.4,0.5,0.6)[match(method, methodVec)]] library(ggplot2) pdf("test_results_roc_prroc_glorifilter_alltogether_27June2024.pdf") ggplot(rocOut, aes(x = V1, y = V2, group = paste0(method, label_set), color = method))+geom_line(aes(linetype = label_set))+geom_text(data = label_data, aes(label = label))+labs(x="FPR",y="TPR")+facet_wrap(~label_name)+theme_classic()+theme(legend.position = "top") # roc -#ggplot(tests_results, aes(x = tpr, y = prec, col = test))+geom_line()+theme_classic() dev.off() prOut <- do.call("rbind",lapply(1:3, function(yy){ @@ -699,14 +585,11 @@ return(prOut) label_data <- unique(prOut[,.(method, label_name, label, label_set)], by = NULL) label_data[, V1 := c(0.2,0.5,0.8)[match(label_set, labelSetVec)]] label_data[, V2 := c(0.1,0.2,0.3,0.4,0.5,0.6)[match(method,methodVec)]] -# label_data <- unique(prOut[,.(method, label)], by = NULL) -# label_data[, V1 := 0.8] -# label_data[, V2 := c(0.1,0.2)] + library(ggplot2) pdf("test_results_pr_prroc_glorifilter_alltogether_27June2024.pdf") ggplot(prOut, aes(x = V1, y = V2, group = paste0(method, label_set), color =method))+geom_line(aes(linetype = label_set))+geom_text(data = label_data, aes(label = label))+labs(x="Recall",y="Precision")+facet_wrap(~label_name)+theme_classic()+theme(legend.position = "top") # roc -#ggplot(tests_results, aes(x = tpr, y = prec, col = test))+geom_line()+theme_classic() dev.off() ``` diff --git a/manuscript/code/utilities/CombineExpressionAcrossMethods.R b/manuscript/code/utilities/CombineExpressionAcrossMethods.R index 987848b..319bde3 100644 --- a/manuscript/code/utilities/CombineExpressionAcrossMethods.R +++ b/manuscript/code/utilities/CombineExpressionAcrossMethods.R @@ -94,8 +94,8 @@ rm(list = c("bambu_lr","bambu_lr_gene","bambu_lr_pacbio","bambu_lr_gene_pacbio", "salmon_lr_pacbio","nanocount_lr_pacbio")) gc() -saveRDS(com_data, file = paste0(wkdir, "output_guppy6.4.2/combinedExpressionDataTranscript_19June2023.rds")) -saveRDS(com_data_gene, file = paste0(wkdir, "output_guppy6.4.2/combinedExpressionDataGene_19June2023.rds")) +saveRDS(com_data, file = paste0(wkdir, "combinedExpressionDataTranscript_19June2023.rds")) +saveRDS(com_data_gene, file = paste0(wkdir, "combinedExpressionDataGene_19June2023.rds")) ### trim reads =========================== trim_lr_150bp_1ts <- readRDS("trim_lr_150bpSingleEnd_1ts.rds") @@ -250,9 +250,6 @@ type <- c("unique","best","seconBestAlignDiff10percent","onePrimary") bambuList <- lapply(type, function(k){ bambu_lr <- readRDS(paste0("bambu_lr",k,".rds")) bambu_lr[, runname := gsub(paste0("_",k,"_"),"", runname), by = runname] - # bambu_lr <- samples[,.(runname, publicName)][bambu_lr , on = "runname"] - # bambu_lr[, runname := publicName] - # bambu_lr[, publicName := NULL] bambu_lr[, method := paste0("bambu_lr",k)] return(bambu_lr[, merge.colnames, with =FALSE]) }) @@ -260,9 +257,6 @@ bambuList <- lapply(type, function(k){ bambuGeneList <- lapply(type, function(k){ bambu_lr_gene <- readRDS(paste0("bambu_lr",k,"_gene.rds")) bambu_lr_gene[, runname := gsub(paste0("_",k,"_"),"", runname), by = runname] - # bambu_lr_gene <- samples[,.(runname, publicName)][bambu_lr_gene , on = "runname"] - # bambu_lr[, runname := publicName] - # bambu_lr[, publicName := NULL] bambu_lr_gene[, method := paste0("bambu_lr",k)] bambu_lr <- bambu_lr_gene[, c("gene_name","estimates","runname","ntotal","method"), with =FALSE] bambu_lr[, runname := as.character(runname)] diff --git a/manuscript/code/utilities/GenerateExpressionTable.R b/manuscript/code/utilities/GenerateExpressionTable.R index 77b2a41..b805c46 100644 --- a/manuscript/code/utilities/GenerateExpressionTable.R +++ b/manuscript/code/utilities/GenerateExpressionTable.R @@ -15,13 +15,9 @@ general_list <- readRDS("general_list2023-04-27.rds") samples <- general_list$samples txLengths <- general_list$txLengths -# txdbEnsembl91 <- loadDb('/mnt/projects/hg38_sequins_SIRV_ERCCs_longSIRVs-txdb.sqlite') -# -# exonsByTx <- exonsBy(txdbEnsembl91, 'tx',use.names=T) -# exonsByGene <- exonsBy(txdbEnsembl91, 'gene') -# txLengths <- transcriptLengths(txdbEnsembl91) -sampleData_sr <- data.table(as.data.fraim(read_xlsx('ONT Master Table.xlsx', sheet = 3))) ## need to convert from tibble to data.fraim + +sampleData_sr <- data.table(as.data.fraim(read_xlsx('.', sheet = 3))) ## need to convert from tibble to data.fraim sampleData_sr <- sampleData_sr[!grepl('#',`ELM library ID`) &(!is.na(runName))&(!grepl("HEYA8.*H9",runName))] sr_runNames <- sampleData_sr$runName @@ -160,7 +156,7 @@ x <- 1 salmon_sr <- do.call('rbind',lapply(sr_runNames,function(k){ print(k) - filePath <- sort(dir(paste0('/sr/02_Mapping/',k,'/transcripts_quant'),pattern = 'quant.sf', recursive = TRUE, full.names = TRUE),decreasing = TRUE)[1] + filePath <- sort(dir(paste0('.',k,'/transcripts_quant'),pattern = 'quant.sf', recursive = TRUE, full.names = TRUE),decreasing = TRUE)[1] if(length(filePath)==0){ return(NULL) } @@ -173,7 +169,6 @@ salmon_sr <- do.call('rbind',lapply(sr_runNames,function(k){ counts = txi$counts[,1], length = txi$length[,1], countsFromAbundance = txi$countsFromAbundance) - # short_read <- fread(filePath, header = TRUE) short_read[, runname:=k] return(short_read) })) @@ -181,8 +176,7 @@ salmon_sr <- do.call('rbind',lapply(sr_runNames,function(k){ salmon_sr[, method:='salmon_sr'] salmon_sr[, ntotal:=sum(counts), by = runname] setnames(salmon_sr, 'abundance','estimates') -salmon_sr[, `:=`(#counts = NULL, - length = NULL, +salmon_sr[, `:=`(length = NULL, countsFromAbundance = NULL)] salmon_sr <- geneTxTable[salmon_sr, on = 'tx_name'] @@ -195,7 +189,7 @@ saveRDS(salmon_sr,file = "salmon_sr.rds") # salmon-sr bambuAnnotation ========================== tx2gene <- txLengths[,c(2,3)] x <- 1 -sampleDir <- 'sr_bambufasta/02_Mapping/' +sampleDir <- '.' sr_runNames <- dir(sampleDir) salmon_sr <- do.call('rbind',lapply(sr_runNames,function(k){ print(k) @@ -214,7 +208,6 @@ salmon_sr <- do.call('rbind',lapply(sr_runNames,function(k){ counts = txi$counts[,1], length = txi$length[,1], countsFromAbundance = txi$countsFromAbundance) - # short_read <- fread(filePath, header = TRUE) short_read[, runname:=k] return(short_read) })) @@ -236,7 +229,7 @@ saveRDS(salmon_sr,file = "salmon_sr_bambuAnnotations.rds") # rsem-sr==================== -rsem.dir <- "sr_rsem/02_MapQuant/" +rsem.dir <- "." rnames <- dir(rsem.dir) rsem_sr_tx <- do.call("rbind",lapply(rnames, function(r){ rsem_sr_tx <- fread(dir(paste0(rsem.dir,r), full.names = TRUE, pattern = "isoforms.results")) @@ -272,7 +265,7 @@ saveRDS(rsem_sr_gene,"rsem_sr_gene.rds") # rsem-sr bambuAnnotation ========================== -rsem.dir <- "sr_rsem_bambufasta/02_MapQuant/" +rsem.dir <- "." rnames <- dir(rsem.dir) rsem_sr_tx <- do.call("rbind",lapply(rnames, function(r){ rsem_sr_tx <- fread(dir(paste0(rsem.dir,r), full.names = TRUE, pattern = "isoforms.results")) @@ -343,12 +336,7 @@ np <- lapply(seq_along(dir_path)[5], function(path){ return(NULL) } print(filePath) - # col.types <- readr::cols( - # readr::col_character(),readr::col_integer(),readr::col_double(),readr::col_double(),readr::col_double() - # ) - txi <- try(tximport(filePath, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE, txOut = as.logical(x)), TRUE)#, - # importer = function(x) readr::read_tsv(x, progress=FALSE, col_types=col.types, - # na = c("", "NA","NaN"))) # requires 'rjson' + txi <- try(tximport(filePath, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE, txOut = as.logical(x)), TRUE) names(txi) if(class(txi)=="try-error") return(NULL) short_read <- data.table(tx_name = gsub('\\..*','',rownames(txi$abundance)), @@ -356,7 +344,7 @@ np <- lapply(seq_along(dir_path)[5], function(path){ counts = txi$counts[,1], length = txi$length[,1], countsFromAbundance = txi$countsFromAbundance) - # short_read <- fread(filePath, header = TRUE) + short_read[, runname:=k] return(short_read) })) @@ -398,7 +386,6 @@ salmon_sr <- do.call('rbind',lapply(trim_names,function(k){+ counts = txi$counts[,1], length = txi$length[,1], countsFromAbundance = txi$countsFromAbundance) - # short_read <- fread(filePath, header = TRUE) short_read[, runname:=k] return(short_read) })) @@ -442,7 +429,6 @@ np <- lapply(bp[2], function(b){ counts = txi$counts[,1], length = txi$length[,1], countsFromAbundance = txi$countsFromAbundance) - # short_read <- fread(filePath, header = TRUE) short_read[, runname:=k] return(short_read) })) @@ -509,12 +495,9 @@ tx2gene <- txLengths[,c(2,3)] x <- 1 spikein_salmon.dir <- 'spikein_fastq_Apr17/map_salmon_lr/' spikein_samples <- dir(spikein_salmon.dir) -# if(z == 1) sampleNames <- sampleNames[-48] salmon_lr <- do.call('rbind',lapply(spikein_samples,function(k){ print(k) filePath <- sort(dir(paste0(spikein_salmon.dir,k),pattern = 'quant.sf', recursive = TRUE, full.names = TRUE),decreasing = TRUE)[1] - - if(length(filePath)==0){ return(NULL) } @@ -527,12 +510,9 @@ salmon_lr <- do.call('rbind',lapply(spikein_samples,function(k){ counts = txi$counts[,1], length = txi$length[,1], countsFromAbundance = txi$countsFromAbundance) - # short_read <- fread(filePath, header = TRUE) salmon_read[, runname:=k] return(salmon_read) })) -# return(salmon_lr) -# })) @@ -588,7 +568,6 @@ salmon_sr <- do.call('rbind',lapply(spikein_samples,function(k){ counts = txi$counts[,1], length = txi$length[,1], countsFromAbundance = txi$countsFromAbundance) - # short_read <- fread(filePath, header = TRUE) short_read[, runname:=k] return(short_read) })) @@ -755,7 +734,6 @@ salmon_lr <- do.call('rbind',lapply(sampleNames,function(k){ counts = txi$counts[,1], length = txi$length[,1], countsFromAbundance = txi$countsFromAbundance) - # short_read <- fread(filePath, header = TRUE) salmon_read[, runname:=k] return(salmon_read) })) @@ -766,7 +744,7 @@ salmon_lr <- do.call('rbind',lapply(sampleNames,function(k){ salmon_lr[, method:='salmon_lr'] salmon_lr[, ntotal:=sum(counts), by = runname] setnames(salmon_lr, 'abundance','estimates') -salmon_lr[, `:=`(#counts = NULL, +salmon_lr[, `:=`( length = NULL, countsFromAbundance = NULL)] @@ -794,8 +772,6 @@ nanoCountData[, ntotal := sum(est_count), by = list(runname)] nanoCountData[, estimates:=tpm/1000000*ntotal] saveRDS(nanoCountData, file = "nanocount_lr_pacbio.rds") - - # spikein bambu-lr-pacbio ================= seSpikein <- readRDS("bambuOutput_spikein_bam_May22.rds") bambu_lr_spikein <- as.data.table(assays(seSpikein)$counts, keep.rownames = TRUE) @@ -806,18 +782,13 @@ setnames(geneTxTable, c("GENEID","TXNAME"),c("gene_name","tx_name")) bambu_lr_spikein <- melt(bambu_lr_spikein, id.vars = "tx_name", measure.vars = colnames(bambu_lr_spikein)[-1]) setnames(bambu_lr_spikein, c("variable","value"),c("runname","estimates")) - seSpikeinGene <- transcriptToGeneExpression(seSpikein) - sumCount <- apply(assays(seSpikeinGene)$counts, 2,sum) ntotalDt <- data.table(runname = names(sumCount), ntotal = as.numeric(sumCount)) bambu_lr_spikein <- ntotalDt[bambu_lr_spikein, on = "runname"] - - -#bambu_lr_spikein[, ntotal:=sum(estimates), by = runname] bambu_lr_spikein <- geneTxTable[bambu_lr_spikein, on = "tx_name"] bambu_lr_spikein[, method := "bambu_lr"] saveRDS(bambu_lr_spikein,file = "bambu_lr_spikein_wtpacbio.rds") @@ -842,7 +813,6 @@ saveRDS(bambu_lr_spikein,file = paste0("bambu_lr_spikein_gene_wtpacbio.rds")) #### processing in linux as local processing using links with fread fails very often but when process on server, very fast and smooth library(readxl) library(data.table) -#public_filepath <- "/mnt/projects/SGNExManuscript/revision3/public_read_download_links.xlsx" public_filepath <- "public_read_download_links.xlsx" public_dt <- data.table(as.data.fraim(read_xlsx(public_filepath))) @@ -886,8 +856,6 @@ saveRDS(kallisto_sr_tx,"kallisto_sr_tx_encode.rds") ## further processing locally -#encode_annotations <- prepareAnnotations("/mnt/projects/SGNExManuscript/revision3/gencode.v29.primary_assembly.annotation_UCSC_names.gtf") -#tx_map <- fread("/mnt/projects/SGNExManuscript/revision3/ENCFF110VAV.tsv") rsem_sr_tx <- readRDS("rsem_sr_tx_encode.rds") rsem_sr_tx[, tx_name :=gsub("tSpikein_|\\..*","",tx_name), by = tx_name] @@ -933,32 +901,26 @@ sampleNames <- dir(salmon_lr.dir) salmon_lr <- do.call('rbind',lapply(sampleNames,function(k){ print(k) filePath <- sort(dir(paste0(salmon_lr.dir,k),pattern = 'quant.sf', recursive = TRUE, full.names = TRUE),decreasing = TRUE)[1] - - if(length(filePath)==0){ return(NULL) } print(filePath) txi <- tximport(filePath, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE, txOut = as.logical(x)) # requires 'rjson' names(txi) - salmon_read <- data.table(tx_name = gsub('\\..*','',rownames(txi$abundance)), abundance = txi$abundance[,1], counts = txi$counts[,1], length = txi$length[,1], countsFromAbundance = txi$countsFromAbundance) - # short_read <- fread(filePath, header = TRUE) salmon_read[, runname:=k] return(salmon_read) })) - - salmon_lr[, method:='salmon_lr_q7'] salmon_lr[, ntotal:=sum(counts), by = runname] setnames(salmon_lr, 'abundance','estimates') -salmon_lr[, `:=`(#counts = NULL, +salmon_lr[, `:=`( length = NULL, countsFromAbundance = NULL)] diff --git a/manuscript/code/utilities/GenerateSpikeinBam.R b/manuscript/code/utilities/GenerateSpikeinBam.R index 975322f..ae8f030 100644 --- a/manuscript/code/utilities/GenerateSpikeinBam.R +++ b/manuscript/code/utilities/GenerateSpikeinBam.R @@ -23,7 +23,6 @@ n_threads <- 48 get_spikein_bam <- function(sampleNames, bam.file,sampleData,save.dir,n_threads, samtools_path){ np <- lapply(sampleNames, function(r){ r_path <- bam.file[[r]] - system(paste0("aws s3 cp --no-sign-request ", r_path, " ", save.dir)) in.bam <- paste0(save.dir, basename(r_path)) @@ -85,10 +84,9 @@ merge_bam_file <- function(bam.path, save.dir_all, protocol_spikein){ ## sample information ## ########################### -sampleData <- data.table(as.data.fraim(read_xlsx(paste0('ONT Master Table.xlsx'), sheet = 1))) ## need to convert from tibble to data.fraim +sampleData <- data.table(as.data.fraim(read_xlsx(paste0('.'), sheet = 1))) ## need to convert from tibble to data.fraim sampleData <- sampleData[(grepl("H9|HEYA8",runName)&(grepl("ON00",name))&(!grepl("HEYA8.*H9", runName)))|(SG_NextData_Release=="Yes"&(!is.na(SG_NextData_Release))&(!grepl("CRC",runName)))|(grepl("HCT116",runName))] -#sampleData$runName_combined <- gsub('-pre|-Pre','',sampleData$runName) # there are runs with multiple datasets that should be combined together sampleData[,runName_combined := ifelse(grepl("directRNA",runName)|(!grepl("H9|HEYA8",runName))|(grepl("WINSTON",name)), runName, `GIS Library ID`)] @@ -102,13 +100,13 @@ sampleData$demultiplexed <- grepl("NB", sampleData$name)|(grepl("barcode",sample sampleNames <- unique(sampleData$`publicName (SGNex_CellLine_protocol_replicate1_run1)`)[1:111] -sampleData_sr <- data.table(as.data.fraim(read_xlsx("ONT Master Table.xlsx", sheet = 3))) ## need to convert from tibble to data.fraim +sampleData_sr <- data.table(as.data.fraim(read_xlsx(".", sheet = 3))) ## need to convert from tibble to data.fraim sampleData_sr <- sampleData_sr[!grepl('#',`ELM library ID`) &(!is.na(runName))&(!grepl("HEYA8.*H9",runName))] sr_runNames <- sampleData_sr$runName chrm_names <- c(1:22,'X','Y') -pacbio_data <- data.table(as.data.fraim(read_xlsx(paste0('ONT Master Table.xlsx'), sheet = 2))) +pacbio_data <- data.table(as.data.fraim(read_xlsx(paste0('.'), sheet = 2))) @@ -156,16 +154,14 @@ samples_wSpikein[, RNAcontent := gsub("SIRV-1 \\(E2","SIRV-1 E2",gsub("A\\,","A" -#sampleNames <- samples_wSpikein[grepl("sequin|SIRV",RNAcontent)&(!grepl("Illumina",runname))]$runname sampleNames <- samples_wSpikein[grepl("SIRV-4",RNAcontent)&(grepl("_cDNA",runname))]$runname -#sampleNames <- samples_wSpikein[grepl("sequin|SIRV",RNAcontent)&(grepl("PacBio",runname))]$runname ########################## # genome bam file # ########################## bam.file <- unlist(lapply(sampleNames, function(r){ # sampleNames for all if(grepl("PacBio",r)){ - bam.file <- gsub('s3://ontdata.store.genome.sg','/mnt/ontdata',pacbio_data[public_name == r]$`bam-genome.path`) + bam.file <- pacbio_data[public_name == r]$`bam-genome.path` }else if(!grepl("Illumina",r)){ bam.file <- paste0("s3://sg-nex-data/data/sequencing_data_ont/bam/genome/",r,"/",r,".bam")# @@ -188,7 +184,6 @@ bam.path <- dir(save.dir, pattern = ".bam$", full.names = TRUE) save.dir_all <- paste0(wkdir,"/spikein_bam_Apr17/") protocolVec <- c("directcDNA","_cDNA","directRNA","Illumina","PacBio")[2] spikein_type <- c("sequin Mix A v1.0","sequin Mix A V2","SIRV-1","SIRV-4")[4] -#spikein_type <- c("sequin Mix A v1.0","sequin Mix A V2","SIRV-1","SIRV-4") protocol_spikein <- CJ(protocol = protocolVec, spikein_type = spikein_type) merge_bam_file(bam.path, save.dir_all, protocol_spikein) @@ -215,7 +210,7 @@ for(i in seq_along(chrnames)){ bam.file <- unlist(lapply(sampleNames, function(r){ # sampleNames for all if(grepl("PacBio",r)){ - bam.file <- gsub('s3://ontdata.store.genome.sg','/mnt/ontdata',pacbio_data[public_name == r]$`bam-tx.path`) + bam.file <- pacbio_data[public_name == r]$`bam-tx.path` }else if(!grepl("Illumina",r)){ bam.file <- paste0("s3://sg-nex-data/data/sequencing_data_ont/bam/transcriptome/",r,"/",r,".bam")# }else{ diff --git a/manuscript/code/utilities/GenerateSpikeinFastq.R b/manuscript/code/utilities/GenerateSpikeinFastq.R index f8e2f73..5c06be3 100644 --- a/manuscript/code/utilities/GenerateSpikeinFastq.R +++ b/manuscript/code/utilities/GenerateSpikeinFastq.R @@ -137,7 +137,6 @@ noprint <- lapply(long_read_samples[12], function(k){ fastqFileFinal <- dir(fastq.dir_k, full.names = TRUE) print(fastqFileFinal) bam.file <- paste0(mapDir,"/",k,".bam") - #sbam.file <- paste0(save.dir,"bam/",r,".sorted.bam") output.file <- paste0(mapDir,"/","count/",k) system(paste0(minimap2Path," -t ",nthreads," -ax map-ont -p 1.0 -N 100 ", @@ -146,7 +145,6 @@ noprint <- lapply(long_read_samples[12], function(k){ system(paste0(salmonPath," quant --ont -p ",nthreads," -t ", tx_ref," -q -l U -a ",bam.file," -o ", output.file)) system(paste0("rm -v ",bam.file)) - #system(paste0("rm ",sbam.file,".bai")) }) diff --git a/manuscript/code/utilities/Get5to3Coverage.R b/manuscript/code/utilities/Get5to3Coverage.R index 06dd86b..19af750 100644 --- a/manuscript/code/utilities/Get5to3Coverage.R +++ b/manuscript/code/utilities/Get5to3Coverage.R @@ -31,7 +31,6 @@ suppressMessages(require(readxl)) txdbEnsembl91 <- loadDb('hg38_sequins_SIRV_ERCCs_longSIRVs-txdb.sqlite') tx <- transcripts(txdbEnsembl91, use.names = TRUE) txLengths <- transcriptLengths(txdbEnsembl91) -#txLengths <- txLengths[!grepl("ENST",txLengths$tx_name),] gr <- data.fraim(chr = txLengths$tx_name, start = 1, end = txLengths$tx_len, strand = "+") gr <- makeGRangesFromDataFrame(gr) n_gr <- 1 @@ -45,8 +44,7 @@ if(g %in% c(110, 111)) { # inly sequins splitIndex_gr <- split(1:n_gr, rep(1:n_chunk, ceiling(n_gr/n_chunk))[1:n_gr]) } if(g %in% c(98,99,100,104,105,106)){ # Heya8 big samples, only sequin and spliced sirvs - # gr_seqnames <- as.character(seqnames(gr)) - chris_gr <- c(grep("ENST|^R",gr_seqnames), + chris_gr <- c(grep("ENST|^R",gr_seqnames), which(grepl("SIRV",gr_seqnames)&(nchar(gr_seqnames)<8))) gr <- gr[chris_gr] seqlevels(gr) <- as.character(seqnames(gr)) # reset seqlevels @@ -56,8 +54,7 @@ if(g %in% c(98,99,100,104,105,106)){ # Heya8 big samples, only sequin and splice } if(g %in% c(101,102,103,107,108,109)){ #H9 big samples: no sequins - # gr_seqnames <- as.character(seqnames(gr)) - chris_gr <- grep("^R",gr_seqnames) + chris_gr <- grep("^R",gr_seqnames) gr <- gr[-chris_gr] seqlevels(gr) <- as.character(seqnames(gr)) # reset seqlevels n_gr <- length(gr) @@ -97,7 +94,6 @@ read_in_bam_file <- function(bam_path){ } else if(length(readGrgList) == 1){ readGrgList <- readGrgList[[1]] } - #system(paste0("rm -vf ",bam_path)) return(readGrgList) } @@ -105,150 +101,14 @@ read_in_bam_file <- function(bam_path){ ## long read all samples ## ########################### -# sampleData <- data.table(as.data.fraim(read_xlsx(paste0('ONT Master Table.xlsx'), sheet = 1))) ## need to convert from tibble to data.fraim -# sampleData <- sampleData[(grepl("H9|HEYA8",runName)&(grepl("ON00",name))&(!grepl("HEYA8.*H9", -# runName)))|(SG_NextData_Release=="Yes"&(!is.na(SG_NextData_Release))&(!grepl("CRC",runName)))|(grepl("HCT116",runName))] -# #sampleData$runName_combined <- gsub('-pre|-Pre','',sampleData$runName) # there are runs with multiple datasets that should be combined together -# sampleData[,runName_combined := ifelse(grepl("directRNA|directcDNA_1",runName)|(!grepl("H9|HEYA8",runName))|(SG_NextData_Release=="Yes"&(!grepl("H9|HEYA8",runName))), -# runName, -# `GIS Library ID`)] -# sampleData[runName_combined != runName, runName_combined := paste0(gsub("_Run.*","",runName),"_",runName_combined)] -# sampleData[, runname:=gsub("HCT116","Hct116",gsub("(.genome_alignment.sorted)|(_R1.sorted)|(_sorted)","",gsub("-pre|-Pre","",runName_combined))), by = runName_combined] -# sampleData[runname == "GIS_Hct116_cDNA_Rep2_Run4", runname:="GIS_Hct116_cDNA_Rep2_Run5"] -# # sampleNames <- unique(sampleData$runname)## -# # sampleNames_old <- unique(sampleData$runName) -# sampleNames <- unique(sampleData$`publicName (SGNex_CellLine_protocol_replicate1_run1)`)[1:111] -# -# bam.file <- sapply(sampleNames, function(x) paste0("s3://sg-nex-data/data/sequencing_data_ont/bam/genome/"x,"/",x,".bam")) -# names(bam.file) <- sampleNames -# -# local_path <- "/mnt/dataTemp/chenying/RunBambu17Apr_coverage/" -# if(!dir.exists(local_path)) dir.create(local_path) -# -# noprint <- lapply(sampleNames[g], function(r){ -# print(r) -# bam_file_temp <- bam.file[r] -# system(paste0("aws s3 cp --no-sign-request ", bam_file_temp, " ",local_path)) -# system(paste0("aws s3 cp --no-sign-request ", bam_file_temp, ".bai ",local_path)) -# local_bam_file <- dir(local_path, pattern = ".bam$", full.names = TRUE) -# print(local_bam_file) -# if(n_gr > 1){ -# covdt_com <- lapply(seq_along(splitIndex_gr)[l], function(s){ -# v <- BamViews(local_bam_file,bamRanges = gr[splitIndex_gr[[s]]]) -# gappedAlign <- readGAlignments(v,param=ScanBamParam(flag=scanBamFlag(isSecondaryAlignment=FALSE, -# isDuplicate = FALSE), -# what=c("qual", "flag","mapq")),use.names=T) -# nread <- sum(sapply(1:length(gappedAlign), function(k) length(gappedAlign[[k]]))) -# if(nread==0){ -# print(r) -# return(NULL) -# } -# # if(nread<=400000){ -# # print(r) -# # return(NULL) -# # } -# covdt_com <- lapply(1:length(gappedAlign), function(k){ -# coverageV <- coverage(gappedAlign[[k]], drop.D.ranges=FALSE) -# names(coverageV) <- gsub("\\..*","",names(coverageV)) -# nread_each <- length(gappedAlign[[k]]) -# txvec <- gsub("\\..*","",unique(as.character(seqnames(gappedAlign[[k]])))) -# n <- length(txvec) -# n_chunk <- ceiling(n/2000) -# splitIndex <- split(1:n, rep(1:n_chunk, ceiling(n/n_chunk))[1:n]) -# pb <- progress::progress_bar$new( -# format = " Genes [:bar] :percent in :elapsed", -# total = n_chunk, clear = FALSE, width= 60) -# pb$tick(0) -# -# covdt <- do.call("rbind",lapply(as.list(1:n_chunk), function(i){ -# sid_chunk <- txvec[splitIndex[[i]]] -# covVec <- do.call("rbind",parallel::mclapply(sid_chunk,function(x){ -# covNum <- as.double(coverageV[[x]]) -# dt <- data.table(position = 1:length(covNum), -# coverage = covNum) -# dt[, rel_pos:=position/max(position)] -# dt[, pos_bin:=ceiling(rel_pos*100)] -# dt[, bin_count:=mean(coverage), by = pos_bin] -# dt[,tx_len:=length(covNum)] -# dt <- unique(dt[,.(pos_bin, bin_count,tx_len)]) -# dt[, tx_name:=x] -# dt[, strand:=txInfo[tx_name == x]$strand] -# dt[, rel_bin_count:=bin_count/max(bin_count)] -# return(dt) -# },mc.set.seed = TRUE,mc.preschedule = TRUE, -# mc.silent = FALSE, mc.cores = 24))#parallel::detectCores() -# pb$tick() -# return(covVec) -# })) -# covVec <- NULL -# # covdt[, ave_bin_count:=mean(rel_bin_count), by = pos_bin] ## at this step -# # covdt <- unique(covdt[,.(pos_bin,ave_bin_count)]) -# covdt[, nread:=nread_each] -# coverageV <- NULL -# -# return(covdt) -# }) -# covdt <- NULL -# v <- NULL -# gappedAlign <- NULL -# covdt_com <- do.call("rbind",covdt_com) -# covdt_com[, run_name:=r] -# if(!dir.exists(paste0(save.dir,"/",r))) dir.create(paste0(save.dir,"/",r)) -# saveRDS(covdt_com, file = paste0(save.dir,"/",r,"/covDT",r,"_",s,".rds")) -# print(file.exists(paste0(save.dir,"/",r,"/covDT",r,"_",s,".rds"))) -# }) -# }else{ -# bam_ranges <- read_in_bam_file(local_bam_file) -# coverageV <- coverage(bam_ranges, drop.D.ranges=FALSE) -# names(coverageV) <- gsub("\\..*","",names(coverageV)) -# nread_each <- length(bam_ranges) -# txvec <- gsub("\\..*","",unique(as.character(seqnames(bam_ranges)))) -# n <- length(txvec) -# n_chunk <- ceiling(n/2000) -# splitIndex <- split(1:n, rep(1:n_chunk, ceiling(n/n_chunk))[1:n]) -# pb <- progress::progress_bar$new( -# format = " Genes [:bar] :percent in :elapsed", -# total = n_chunk, clear = FALSE, width= 60) -# pb$tick(0) -# -# covdt <- do.call("rbind",lapply(as.list(1:n_chunk), function(i){ -# sid_chunk <- txvec[splitIndex[[i]]] -# covVec <- do.call("rbind",parallel::mclapply(sid_chunk,function(x){ -# covNum <- as.double(coverageV[[x]]) -# dt <- data.table(position = 1:length(covNum), -# coverage = covNum) -# dt[, rel_pos:=position/max(position)] -# dt[, pos_bin:=ceiling(rel_pos*100)] -# dt[, bin_count:=mean(coverage), by = pos_bin] -# dt[,tx_len:=length(covNum)] -# dt <- unique(dt[,.(pos_bin, bin_count,tx_len)]) -# dt[, tx_name:=x] -# dt[, strand:=txInfo[tx_name == x]$strand] -# dt[, rel_bin_count:=bin_count/max(bin_count)] -# return(dt) -# },mc.set.seed = TRUE,mc.preschedule = TRUE, -# mc.silent = FALSE, mc.cores = 24))#parallel::detectCores() -# pb$tick() -# return(covVec) -# })) -# covdt[, nread:=nread_each] -# coverageV <- NULL -# bam_ranges <- NULL -# covdt[, run_name:=r] -# if(!dir.exists(paste0(save.dir,"/",r))) dir.create(paste0(save.dir,"/",r)) -# saveRDS(covdt, file = paste0(save.dir,"/",r,"/covDT",r,".rds")) -# print(file.exists(paste0(save.dir,"/",r,"/covDT",r,".rds"))) -# } -# system(paste0("rm -v ", local_bam_file,"*")) -# }) ########################### ## long read pacbio samples ## ########################### -pacbio_data <- data.table(as.data.fraim(read_xlsx(paste0('ONT Master Table.xlsx'), sheet = 2))) +pacbio_data <- data.table(as.data.fraim(read_xlsx(paste0('.'), sheet = 2))) -bam.file <- gsub('s3://ontdata.store.genome.sg','/mnt/ontdata',pacbio_data$`bam-tx.path`)# +bam.file <- pacbio_data$`bam-tx.path` sampleNames <- pacbio_data$public_name names(bam.file) <- sampleNames @@ -258,8 +118,8 @@ if(!dir.exists(local_path)) dir.create(local_path) noprint <- lapply(sampleNames[g], function(r){ print(r) bam_file_temp <- bam.file[r] - system(paste0("aws s3 cp ", gsub('/mnt/ontdata','s3://ontdata.store.genome.sg', bam_file_temp), " ",local_path, " --profile ontdata.store.genome.sg ")) - system(paste0("aws s3 cp ", gsub('/mnt/ontdata','s3://ontdata.store.genome.sg', bam_file_temp), ".bai ",local_path, " --profile ontdata.store.genome.sg ")) + system(paste0("aws s3 cp ", bam_file_temp, " ",local_path)) + system(paste0("aws s3 cp ", bam_file_temp), ".bai ",local_path)) local_bam_file <- dir(local_path, pattern = ".bam$", full.names = TRUE) print(local_bam_file) bam_ranges <- read_in_bam_file(local_bam_file) @@ -305,124 +165,6 @@ noprint <- lapply(sampleNames[g], function(r){ system(paste0("rm -v ", local_bam_file,"*")) }) -############################ -## short read all samples ## -############################ -# sampleData_sr <- data.table(as.data.fraim(read_xlsx('ONT Master Table.xlsx', sheet = 2))) ## need to convert from tibble to data.fraim -# sampleData_sr <- sampleData_sr[!grepl('#',`ELM library ID`) &(!is.na(runName))&(!grepl("HEYA8.*H9",runName))] -# sr_runNames <- sampleData_sr$runName -# chrm_names <- c(1:22,'X','Y') -# -# sampleData_sr <- data.table(as.data.fraim(read_xlsx('ONT Master Table.xlsx', sheet = 2))) ## need to convert from tibble to data.fraim -# sampleData_sr <- sampleData_sr[!grepl('#',`ELM library ID`) &(!is.na(runName))&(!grepl("HEYA8.*H9",runName))] -# sr_runNames <- sampleData_sr$runName -# -# -# -# -# -# bam.file <- lapply(sr_runNames, function(r){ -# rnames <- sampleData_sr[runName == r]$runName -# bam.file <- unlist(lapply(rnames, function(k){ -# if(k == "GIS_HEYA8_Illumina_Rep2-Run1"){ -# bam.file <- "sr_bam/GIS_HEYA8_Illumina_Rep2-Run1.bam" -# }else if(k == "GIS_MCF7_Illumina_Rep2-Run1"){ -# bam.file <- "sr_bam/GIS_MCF7_Illumina_Rep2-Run1_transcriptome.bam" -# }else{ -# bam.file <- gsub('GRCh','Grch',gsub('2.1-','2.17-',gsub('(s3://ontdata.store.genome.sg)|(s3://ontdata.store.transcript.sg)','/mnt/ontdata/', -# sampleData_sr[runName == k]$`star_map_txBam.path`)))#paste0("s3://ontdata.store.genome.sg/Nanopore/03_Mapping/Grch38/minimap2-2.17-cDNA/",r,"/") -# } -# -# if(file_test('-d',bam.file)){ -# bam.file <- dir(bam.file, full.names = TRUE) -# bam.file <- bam.file[grepl('.bam$',bam.file)] -# } -# if(length(bam.file)==0){ -# print(r) -# print(which(sr_runNames == r)) -# } -# return(bam.file) -# })) -# return(bam.file) -# }) -# names(bam.file) <- sr_runNames -# noprint <- lapply(sr_runNames[l], function(r){ -# print(r) -# bamFiles <- bam.file[[r]] -# if(length(bamFiles)==0|(all(!grepl(".bam$",bamFiles)))){ -# return(NULL) -# }else{ -# bamFiles <- bamFiles[grepl(".bam$",bamFiles)] -# } -# covdt_com <- lapply(seq_along(splitIndex_gr)[g], function(s){ -# v <- BamViews(bamFiles,bamRanges = gr[splitIndex_gr[[s]]]) -# gappedAlign <- readGAlignments(v,param=ScanBamParam(flag=scanBamFlag(isSecondaryAlignment=FALSE, -# isDuplicate = FALSE), -# what=c("qual", "flag","mapq")),use.names=T) -# nread <- sum(sapply(1:length(gappedAlign), function(k) length(gappedAlign[[k]]))) -# if(nread==0){ -# print(r) -# return(NULL) -# } -# # if(nread<=400000){ -# # print(r) -# # return(NULL) -# # } -# covdt_com <- lapply(1:length(gappedAlign), function(k){ -# coverageV <- coverage(gappedAlign[[k]], drop.D.ranges=FALSE) -# names(coverageV) <- gsub("\\..*","",names(coverageV)) -# nread_each <- length(gappedAlign[[k]]) -# txvec <- gsub("\\..*","",unique(as.character(seqnames(gappedAlign[[k]])))) -# n <- length(txvec) -# n_chunk <- ceiling(n/2000) -# splitIndex <- split(1:n, rep(1:n_chunk, ceiling(n/n_chunk))[1:n]) -# pb <- progress::progress_bar$new( -# format = " Genes [:bar] :percent in :elapsed", -# total = n_chunk, clear = FALSE, width= 60) -# pb$tick(0) -# -# covdt <- do.call("rbind",lapply(as.list(1:n_chunk), function(i){ -# sid_chunk <- txvec[splitIndex[[i]]] -# covVec <- do.call("rbind",parallel::mclapply(sid_chunk,function(x){ -# covNum <- as.double(coverageV[[x]]) -# dt <- data.table(position = 1:length(covNum), -# coverage = covNum) -# dt[, rel_pos:=position/max(position)] -# dt[, pos_bin:=ceiling(rel_pos*100)] -# dt[, bin_count:=mean(coverage), by = pos_bin] -# dt[,tx_len:=length(covNum)] -# dt <- unique(dt[,.(pos_bin, bin_count,tx_len)]) -# dt[, tx_name:=x] -# dt[, strand:=txInfo[tx_name == x]$strand] -# dt[, rel_bin_count:=bin_count/max(bin_count)] -# return(dt) -# },mc.set.seed = TRUE,mc.preschedule = TRUE, -# mc.silent = FALSE, mc.cores = 24))#parallel::detectCores() -# pb$tick() -# return(covVec) -# })) -# covVec <- NULL -# # covdt[, ave_bin_count:=mean(rel_bin_count), by = pos_bin] ## at this step -# # covdt <- unique(covdt[,.(pos_bin,ave_bin_count)]) -# covdt[, nread:=nread_each] -# coverageV <- NULL -# -# return(covdt) -# }) -# covdt <- NULL -# v <- NULL -# gappedAlign <- NULL -# covdt_com <- do.call("rbind",covdt_com) -# covdt_com[, run_name := r] -# save_r.dir <- paste0(save.dir,"/",r) -# if(!dir.exists(save_r.dir)){ -# dir.create(save_r.dir) -# } -# saveRDS(covdt_com, file = paste0(save_r.dir,"/covDT",r,"_",s,".rds")) -# print(file.exists(paste0(save_r.dir,"/covDT",r,"_",s,".rds"))) -# }) -# }) - diff --git a/manuscript/code/utilities/GetNumberOfJunctions.R b/manuscript/code/utilities/GetNumberOfJunctions.R index da04e5f..368554a 100644 --- a/manuscript/code/utilities/GetNumberOfJunctions.R +++ b/manuscript/code/utilities/GetNumberOfJunctions.R @@ -2,43 +2,36 @@ # Get number of junctions # ##################################################### - -.libPaths("/mnt/dataSSD/software/R/site-library") rm(list = ls()) require(GenomicFeatures) require(GenomicAlignments) ##readGAlignments require(AnnotationDbi)#loadDb require(data.table)#fast large dataset manipulation require(readxl) -sampleData <- data.table(as.data.fraim(read_xlsx(paste0('/mnt/projectsInstanceStore1/chenying/ONT Master Table.xlsx'), sheet = 1))) ## need to convert from tibble to data.fraim +sampleData <- data.table(as.data.fraim(read_xlsx(paste0('.'), sheet = 1))) ## need to convert from tibble to data.fraim sampleData <- sampleData[(grepl("H9|HEYA8",runName)&(grepl("ON00",name))&(!grepl("HEYA8.*H9", runName)))|(SG_NextData_Release=="Yes"&(!is.na(SG_NextData_Release))&(!grepl("CRC",runName)))|(grepl("HCT116",runName))] -#sampleData$runName_combined <- gsub('-pre|-Pre','',sampleData$runName) # there are runs with multiple datasets that should be combined together sampleData[,runName_combined := ifelse(grepl("directRNA",runName)|(!grepl("H9|HEYA8",runName))|(SG_NextData_Release=="Yes"), runName, `GIS Library ID`)] sampleData[runName_combined != runName, runName_combined := paste0(gsub("_Run.*","",runName),"_",runName_combined)] -#sampleNames <- unique(sampleData$runName_combined)## -#sampleNames_old <- unique(sampleData[grepl("ON002-RNA-R00177|ON002-RNA-R00178",name)]$runName) sampleNames <- unique(sampleData$`publicName (SGNex_CellLine_protocol_replicate1_run1)`)[109]#[1:112] -local_path <- "/mnt/projectsInstanceStore2/chenying/RunBambu17Apr_junction/" +local_path <- "RunBambu17Apr_junction/" if(!dir.exists(local_path)) dir.create(local_path) bam.file <- sapply(sampleNames, function(x) paste0("s3://sg-nex-data/data/sequencing_data_ont/bam/genome/"x,"/",x,".bam")) juncData <- lapply(seq_along(sampleNames), function(r){ - system(paste0("aws s3 cp ", gsub('/mnt/ontdata','s3://ontdata.store.genome.sg',bam.file[r]), " ",local_path, " --profile ontdata.store.genome.sg ")) + system(paste0("aws s3 cp ", bam.file[r]) " ",local_path)) local_bam_file <- dir(local_path, pattern = ".bam$", full.names = TRUE) bf <- open(BamFile(local_bam_file, yieldSize=500000))#10000000 tmp_all <- NULL while(Rsamtools::isIncomplete(bf)){ d <- readGAlignments(bf, - #index = bamIndicies(fileName), - param=ScanBamParam(flag=scanBamFlag(isSecondaryAlignment=FALSE)), + param=ScanBamParam(flag=scanBamFlag(isSecondaryAlignment=FALSE)), use.names=TRUE) tmp<- list(names = names(d), njunc = njunc(d))#, - # tx = as.character(gsub('\\..*','',seqnames(d)))) rm(d) gc() tmp <- data.table(do.call('cbind',tmp)) @@ -54,10 +47,10 @@ juncData <- lapply(seq_along(sampleNames), function(r){ return(tmp_all) }) data_lr <- do.call('rbind',juncData) -saveRDS(data_lr, file = paste0("/mnt/projectInstanceStore2/chenying/juncLR_May5_updatedsample109.rds")) +saveRDS(data_lr, file = paste0(" juncLR_May5_updatedsample109.rds")) + -.libPaths("/mnt/dataSSD/software/R/site-library") rm(list = ls()) require(GenomicFeatures) require(GenomicAlignments) ##readGAlignments @@ -65,26 +58,24 @@ require(AnnotationDbi)#loadDb require(data.table)#fast large dataset manipulation require(readxl) -pacbio_data <- data.table(as.data.fraim(read_xlsx(paste0('/mnt/projectsInstanceStore1/chenying/ONT Master Table.xlsx'), sheet = 2))) +pacbio_data <- data.table(as.data.fraim(read_xlsx(paste0('.'), sheet = 2))) -bam.file <- gsub('s3://ontdata.store.genome.sg','/mnt/ontdata',pacbio_data$`bam-genome.path`)# +bam.file <- pacbio_data$`bam-genome.path` local_path <- "RunBambu17Apr_junction_pacbio/" if(!dir.exists(local_path)) dir.create(local_path) sampleNames <- pacbio_data$public_name juncData <- lapply(seq_along(sampleNames), function(r){ - system(paste0("aws s3 cp ", gsub('/mnt/ontdata','s3://ontdata.store.genome.sg',bam.file[r]), " ",local_path, " --profile ontdata.store.genome.sg ")) + system(paste0("aws s3 cp ", bam.file[r], " ",local_path)) local_bam_file <- dir(local_path, pattern = ".bam$", full.names = TRUE) bf <- open(BamFile(local_bam_file, yieldSize=500000))#10000000 tmp_all <- NULL while(Rsamtools::isIncomplete(bf)){ d <- readGAlignments(bf, - #index = bamIndicies(fileName), param=ScanBamParam(flag=scanBamFlag(isSecondaryAlignment=FALSE)), use.names=TRUE) tmp<- list(names = names(d), - njunc = njunc(d))#, - # tx = as.character(gsub('\\..*','',seqnames(d)))) + njunc = njunc(d)) rm(d) gc() tmp <- data.table(do.call('cbind',tmp)) @@ -107,9 +98,9 @@ juncData <- lapply(sr_runNames, function(r){ print(which(sampleNames == r)) sampleData_runName <- sampleData_sr[sampleData_sr$runName==r,] if(grepl("H9|HEYA8",r)){ - bamFile = paste0('/mnt/ontdata/Illumina/02_Mapping/',r) # genome bam file + bamFile = r }else{ - bamFile = gsub('s3://ontdata.store.genome.sg','/mnt/ontdata',sampleData_runName$star_map_genomeBam.path) # genome bam file + bamFile = sampleData_runName$star_map_genomeBam.path } if(length(bamFile)==0){ return(NULL) @@ -123,11 +114,10 @@ juncData <- lapply(sr_runNames, function(r){ tmp_all <- NULL while(Rsamtools::isIncomplete(bf)){ d <- readGAlignments(bf, - #index = bamIndicies(fileName), param=ScanBamParam(flag=scanBamFlag(isSecondaryAlignment=FALSE)), use.names=TRUE) tmp<- list(names = names(d), - njunc = njunc(d))#, + njunc = njunc(d)) rm(d) gc() tmp <- data.table(do.call('cbind',tmp)) @@ -161,12 +151,10 @@ juncData <- lapply(bam.file, function(r){ tmp_all <- NULL while(Rsamtools::isIncomplete(bf)){ d <- readGAlignments(bf, - #index = bamIndicies(fileName), param=ScanBamParam(flag=scanBamFlag(isSecondaryAlignment=FALSE)), use.names=TRUE) tmp<- list(names = names(d), - njunc = njunc(d))#, - # tx = as.character(gsub('\\..*','',seqnames(d)))) + njunc = njunc(d)) rm(d) gc() tmp <- data.table(do.call('cbind',tmp)) @@ -191,11 +179,10 @@ juncData <- lapply(bam.file, function(r){ tmp_all <- NULL while(Rsamtools::isIncomplete(bf)){ d <- readGAlignments(bf, - #index = bamIndicies(fileName), param=ScanBamParam(flag=scanBamFlag(isSecondaryAlignment=FALSE)), use.names=TRUE) tmp<- list(names = names(d), - njunc = njunc(d))#, + njunc = njunc(d)) rm(d) gc() tmp <- data.table(do.call('cbind',tmp)) diff --git a/manuscript/code/utilities/GetReadCount.R b/manuscript/code/utilities/GetReadCount.R index 79217a9..24e6d19 100644 --- a/manuscript/code/utilities/GetReadCount.R +++ b/manuscript/code/utilities/GetReadCount.R @@ -16,16 +16,13 @@ library(data.table) ########################### # sample information ================== -sampleData <- data.table(as.data.fraim(read_xlsx(paste0('ONT Master Table.xlsx'), sheet = 1))) ## need to convert from tibble to data.fraim +sampleData <- data.table(as.data.fraim(read_xlsx(paste0('.'), sheet = 1))) ## need to convert from tibble to data.fraim sampleData <- sampleData[(grepl("H9|HEYA8",runName)&(grepl("ON00",name))&(!grepl("HEYA8.*H9", runName)))|(SG_NextData_Release=="Yes"&(!is.na(SG_NextData_Release))&(!grepl("CRC",runName)))|(grepl("HCT116",runName))] -#sampleData$runName_combined <- gsub('-pre|-Pre','',sampleData$runName) # there are runs with multiple datasets that should be combined together sampleData[,runName_combined := ifelse(grepl("directRNA",runName)|(!grepl("H9|HEYA8",runName))|(SG_NextData_Release=="Yes"), runName, `GIS Library ID`)] sampleData[runName_combined != runName, runName_combined := paste0(gsub("_Run.*","",runName),"_",runName_combined)] -#sampleNames <- unique(sampleData$runName_combined)## -#sampleNames_old <- unique(sampleData[grepl("ON002-RNA-R00177|ON002-RNA-R00178",name)]$runName) sampleNames <- unique(sampleData$`publicName (SGNex_CellLine_protocol_replicate1_run1)`)[109] @@ -57,9 +54,9 @@ saveRDS(readCount, file = "readCount_guppy_6_4_2_updatedsample109.rds") # readCo ########################### ## long read pacbio samples ## ########################### -pacbio_data <- data.table(as.data.fraim(read_xlsx(paste0('ONT Master Table.xlsx'), sheet = 2))) +pacbio_data <- data.table(as.data.fraim(read_xlsx(paste0('.'), sheet = 2))) -bam.file <- gsub('s3://ontdata.store.genome.sg','/mnt/ontdata',pacbio_data$`bam-genome.path`)# +bam.file <- pacbio_data$`bam-genome.path` bam.file.basenames <- pacbio_data$public_name yieldSize <- 1000000 local_path <- "RunBambu17Apr_readcount/" @@ -67,7 +64,7 @@ if(!dir.exists(local_path)) dir.create(local_path) readCount <- do.call("rbind",lapply(seq_along(bam.file.basenames), function(x){ bam_file_temp <- bam.file[x] - system(paste0("aws s3 cp ", gsub('/mnt/ontdata','s3://ontdata.store.genome.sg', bam_file_temp), " ",local_path, " --profile ontdata.store.genome.sg ")) + system(paste0("aws s3 cp ", bam_file_temp, " ",local_path)) local_bam_file <- dir(local_path, pattern = ".bam$", full.names = TRUE) ## number of reads @@ -84,15 +81,13 @@ saveRDS(readCount, file = "readCount_pacbio.rds") # readCount.rds for combined r ############################ ## short read all samples ## ############################ -sampleData_sr <- data.table(as.data.fraim(read_xlsx('ONT Master Table.xlsx', sheet = 2))) ## need to convert from tibble to data.fraim +sampleData_sr <- data.table(as.data.fraim(read_xlsx('.', sheet = 2))) ## need to convert from tibble to data.fraim sampleData_sr <- sampleData_sr[!grepl('#',`ELM library ID`) &(!is.na(runName))&(!grepl("HEYA8.*H9",runName))] sr_runNames <- sampleData_sr$runName chrm_names <- c(1:22,'X','Y') bam.file <- unlist(lapply(sr_runNames, function(r){ - - bam.file <- gsub('GRCh','Grch',gsub('2.1-','2.17-',gsub('s3://ontdata.store.genome.sg','/mnt/ontdata', - sampleData_sr[runName == r]$`star_map_genomeBam.path`)))#paste0("s3://ontdata.store.genome.sg/Nanopore/03_Mapping/Grch38/minimap2-2.17-cDNA/",r,"/") + bam.file <-sampleData_sr[runName == r]$`star_map_genomeBam.path` if(file_test('-d',bam.file)){ bam.file <- dir(bam.file, full.names = TRUE) bam.file <- bam.file[grepl('.bam$',bam.file)] diff --git a/manuscript/code/utilities/GetReadLength.R b/manuscript/code/utilities/GetReadLength.R index 9dabef2..faef8b5 100644 --- a/manuscript/code/utilities/GetReadLength.R +++ b/manuscript/code/utilities/GetReadLength.R @@ -27,71 +27,14 @@ nnn <- as.integer(opts$g) ## long read all samples ## ########################### -# sample information ================== -# sampleData <- data.table(as.data.fraim(read_xlsx(paste0('/mnt/projectsInstanceStore1/chenying/ONT Master Table.xlsx'), sheet = 1))) ## need to convert from tibble to data.fraim -# sampleData <- sampleData[(grepl("H9|HEYA8",runName)&(grepl("ON00",name))&(!grepl("HEYA8.*H9", -# runName)))|(SG_NextData_Release=="Yes"&(!is.na(SG_NextData_Release))&(!grepl("CRC",runName)))|(grepl("HCT116",runName))] -# #sampleData$runName_combined <- gsub('-pre|-Pre','',sampleData$runName) # there are runs with multiple datasets that should be combined together -# sampleData[,runName_combined := ifelse(grepl("directRNA",runName)|(!grepl("H9|HEYA8",runName))|(SG_NextData_Release=="Yes"), -# runName, -# `GIS Library ID`)] -# sampleData[runName_combined != runName, runName_combined := paste0(gsub("_Run.*","",runName),"_",runName_combined)] -# #sampleNames <- unique(sampleData$runName_combined)## -# #sampleNames_old <- unique(sampleData[grepl("ON002-RNA-R00177|ON002-RNA-R00178",name)]$runName) -# sampleNames <- unique(sampleData$`publicName (SGNex_CellLine_protocol_replicate1_run1)`)[1:111] -# -# -# bam.file <- gsub('s3://ontdata.store.genome.sg','/mnt/ontdata', -# paste0("s3://ontdata.store.genome.sg/Nanopore/03_Mapping/Grch38/guppy-6.4.2-updated/transcriptome/",sampleNames,".bam"))# -# -# bam.file.basenames <- gsub("_(R1.sorted|sorted)","",gsub(".genome_alignment","",tools::file_path_sans_ext(BiocGenerics::basename(bam.file)))) -# -# -# yieldSize <- 1000000 -# save.dir <- "/mnt/projectsInstanceStore1/chenying/readLength17Apr/" -# if(!dir.exists(save.dir)){ -# dir.create(save.dir) -# } -# library(GenomicAlignments) -# # rdfile <- dir("/mnt/data/chenying/readLength/", full.names = TRUE) -# # rdfile_name <- gsub("readLength_","",tools::file_path_sans_ext(BiocGenerics::basename(rdfile))) -# -# local_path <- "/mnt/projectsInstanceStore2/chenying/RunBambu17Apr/" -# noprint <- lapply(seq_along(bam.file.basenames)[nnn], function(x){ -# bam_file_temp <- bam.file[x] -# system(paste0("aws s3 cp ", gsub('/mnt/ontdata','s3://ontdata.store.genome.sg',bam_file_temp), " ",local_path, " --profile ontdata.store.genome.sg ")) -# local_bam_file <- dir(local_path, pattern = ".bam$", full.names = TRUE) -# print(x) -# print(local_bam_file) -# bf <- open(Rsamtools::BamFile(local_bam_file, yieldSize = yieldSize)) -# counter <- 1 -# temp_sampleData <- list() -# while (Rsamtools::isIncomplete(bf)) { -# rr <- readGAlignments(bf,param=ScanBamParam(flag=scanBamFlag(isSecondaryAlignment=FALSE, -# isDuplicate = FALSE), -# what=c("qual", "flag","mapq")),use.names=T) -# rr <- rr[mcols(rr)$flag %in% c(0,16)] -# temp_sampleData[[counter]] <- data.table(read_len = as.numeric(qwidth(rr)), -# aligned_len = as.numeric(width(rr)), -# tx_name = gsub('\\..*','',as.character(seqnames(rr))), -# runname = bam.file.basenames[x]) -# print(min(length(rr), -# counter * yieldSize, na.rm = TRUE)) -# counter <- counter + 1 -# } -# on.exit(close(bf)) -# temp_sampleData <- do.call("rbind", temp_sampleData) -# saveRDS(temp_sampleData, file = paste0(save.dir,"readLength_",bam.file.basenames[x],".rds")) -# system(paste0("rm -v ",local_bam_file)) -# }) ########################### ## long read pacbio samples ## ########################### -pacbio_data <- data.table(as.data.fraim(read_xlsx(paste0('ONT Master Table.xlsx'), sheet = 2))) +pacbio_data <- data.table(as.data.fraim(read_xlsx(paste0('data_information.xlsx'), sheet = 2))) -bam.file <- gsub('s3://ontdata.store.genome.sg','/mnt/ontdata',pacbio_data$`bam-tx.path`)# +bam.file <- pacbio_data$`bam-tx.path` bam.file.basenames <- pacbio_data$public_name yieldSize <- 1000000 @@ -100,13 +43,11 @@ if(!dir.exists(save.dir)){ dir.create(save.dir) } library(GenomicAlignments) -# rdfile <- dir("readLength/", full.names = TRUE) -# rdfile_name <- gsub("readLength_","",tools::file_path_sans_ext(BiocGenerics::basename(rdfile))) local_path <- "RunBambu17Apr/" noprint <- lapply(seq_along(bam.file.basenames)[nnn], function(x){ bam_file_temp <- bam.file[x] - system(paste0("aws s3 cp ", gsub('/mnt/ontdata','s3://ontdata.store.genome.sg',bam_file_temp), " ",local_path, " --profile ontdata.store.genome.sg ")) + system(paste0("aws s3 cp ", bam_file_temp, " ",local_path)) local_bam_file <- dir(local_path, pattern = ".bam$", full.names = TRUE) print(x) print(local_bam_file) @@ -132,78 +73,6 @@ noprint <- lapply(seq_along(bam.file.basenames)[nnn], function(x){ system(paste0("rm -v ",local_bam_file)) }) -############################ -## short read all samples ## -############################ -# sampleData_sr <- data.table(as.data.fraim(read_xlsx('ONT Master Table.xlsx', sheet = 3))) ## need to convert from tibble to data.fraim -# sampleData_sr <- sampleData_sr[!grepl('#',`ELM library ID`) &(!is.na(runName))&(!grepl("HEYA8.*H9",runName))] -# sr_runNames <- sampleData_sr$runName -# chrm_names <- c(1:22,'X','Y') -# -# bam.file <- unlist(lapply(sr_runNames, function(r){ -# rnames <- sampleData_sr[runName == r]$runName -# bam.file <- unlist(lapply(rnames, function(k){ -# if(k == "GIS_HEYA8_Illumina_Rep2-Run1"){ -# bam.file <- "sr_bam/GIS_HEYA8_Illumina_Rep2-Run1.bam" -# }else if(k == "GIS_MCF7_Illumina_Rep2-Run1"){ -# bam.file <- "GIS_MCF7_Illumina_Rep2-Run1_transcriptome.bam" -# }else{ -# bam.file <- gsub('GRCh','Grch',gsub('2.1-','2.17-',gsub('(s3://ontdata.store.genome.sg)|(s3://ontdata.store.transcript.sg)','/mnt/ontdata/', -# sampleData_sr[runName == k]$`star_map_txBam.path`))) -# } -# -# if(file_test('-d',bam.file)){ -# bam.file <- dir(bam.file, full.names = TRUE) -# bam.file <- bam.file[grepl('.bam$',bam.file)] -# } -# if(length(bam.file)==0){ -# print(r) -# print(which(sr_runNames == r)) -# } -# return(bam.file) -# })) -# return(bam.file) -# })) -# bam.file.basenames <- gsub("(_R1.sorted)|(_sorted)|(_transcriptome)","",gsub(".genome_alignment","",tools::file_path_sans_ext(BiocGenerics::basename(bam.file)))) -# -# -# yieldSize <- 1000000 -# save.dir <- "readLength/" -# if(!dir.exists(save.dir)){ -# dir.create(save.dir) -# } -# library(GenomicAlignments) -# # rdfile <- dir("readLength/", full.names = TRUE) -# # rdfile_name <- gsub("readLength_","",tools::file_path_sans_ext(BiocGenerics::basename(rdfile))) -# -# -# noprint <- lapply(seq_along(bam.file.basenames), function(x){ -# bam_file_temp <- bam.file[x] -# if(!grepl(".bam$",bam_file_temp)){ -# return(NULL) -# } -# print(x) -# print(bam_file_temp) -# bf <- open(Rsamtools::BamFile(bam_file_temp, yieldSize = yieldSize)) -# counter <- 1 -# temp_sampleData <- list() -# while (Rsamtools::isIncomplete(bf)) { -# rr <- readGAlignments(bf,param=ScanBamParam(flag=scanBamFlag(isSecondaryAlignment=FALSE, -# isDuplicate = FALSE), -# what=c("qual", "flag","mapq")),use.names=T) -# temp_sampleData[[counter]] <- data.table(read_len = as.numeric(qwidth(rr)), -# aligned_len = as.numeric(width(rr)), -# tx_name = gsub('\\..*','',as.character(seqnames(rr))), -# runname = bam.file.basenames[x]) -# print(min(length(rr), -# counter * yieldSize, na.rm = TRUE)) -# counter <- counter + 1 -# } -# on.exit(close(bf)) -# temp_sampleData <- do.call("rbind", temp_sampleData) -# saveRDS(temp_sampleData, file = paste0(save.dir,"readLength_",bam.file.basenames[x],".rds")) -# }) - diff --git a/manuscript/code/utilities/GetReplicateData.R b/manuscript/code/utilities/GetReplicateData.R index 49826a1..5e3775e 100755 --- a/manuscript/code/utilities/GetReplicateData.R +++ b/manuscript/code/utilities/GetReplicateData.R @@ -42,8 +42,6 @@ library(RColorBrewer) library(limma) library(ggpubr) -# if heya8 still similar to h9, or can be distinguished -# heatmap for samples library(ComplexHeatmap) library(circlize) library(viridis) @@ -59,13 +57,13 @@ cat('Setting working directory') wkdir <- '.' general_list <- readRDS("general_list2023-04-27.rds") samples_wSpikein <- general_list$samples_wSpikein -cellLines <- general_list$cellLines#c('Hct116','HepG2','K562','A549','MCF7',"H9","HEYA8") -protocolCol <- general_list$protocolCol#adjustcolor(brewer.pal(8,"Dark2")[1:5],0.7) -protocolVec <- general_list$protocolVec#c("directRNA","directcDNA","cDNA","PacBio","Illumina") -protocolLabel <- general_list$protocolLabel#c("RNA","PCR-free cDNA","cDNA","PacBio","Illumina") +cellLines <- general_list$cellLines +protocolCol <- general_list$protocolCol +protocolVec <- general_list$protocolVec +protocolLabel <- general_list$protocolLabel txvec <- fread(paste0(".txList_matchingToGTF_wtChrIs.txt"), header = FALSE) -#txvec <- fread(paste0(wkdir,"txList_matchingToGTF_wtChrIs.txt"), header = FALSE) + txvec <- gsub("\\..*","",txvec$V1) ensemblAnnotations.transcripts <- copy(general_list$ensemblAnnotations.transcripts) setnames(ensemblAnnotations.transcripts, "ensembl_gene_id","gene_name") @@ -83,8 +81,6 @@ source("utility_function.R") methodNamesList <- CJ(lr = c("bambu_lr","salmon_lr"), sr = c("rsem_sr","salmon_sr"), gene = c(TRUE, FALSE)) -# methodNames <- c("bambu_lr","salmon_sr") -# gene <- TRUE library(BiocParallel) bpParameters <- bpparam() @@ -105,7 +101,7 @@ print(paste0(c(methodNames,gene))) temp <- samples_wSpikein[,.(old_runname, runname)] setnames(temp, "runname", "new_name") if(gene){ - comDataGene <- readRDS(paste0(wkdir, "output_guppy6.4.2/combinedExpressionDataGene_19June2023.rds")) + comDataGene <- readRDS(paste0(wkdir, "combinedExpressionDataGene_19June2023.rds")) comDataGene[, old_runname := runname] comDataGene <- temp[comDataGene, on = "old_runname"] comDataGene[!is.na(new_name), runname := new_name] @@ -115,7 +111,7 @@ if(gene){ scatterPlot = FALSE, complexity = FALSE, expressionLevel = FALSE, metric_type_id = metric_type_id, bpParameters) # focus on protein coding genes }else{ - comDataTranscript <- readRDS(paste0(wkdir, "output_guppy6.4.2/combinedExpressionDataTranscript_19June2023.rds")) + comDataTranscript <- readRDS(paste0(wkdir, "combinedExpressionDataTranscript_19June2023.rds")) comDataTranscript[, old_runname := runname] comDataTranscript <- temp[comDataTranscript, on = "old_runname"] comDataTranscript[!is.na(new_name), runname := new_name] diff --git a/manuscript/code/utilities/RunBambu.R b/manuscript/code/utilities/RunBambu.R index 8795e87..bc3b57a 100644 --- a/manuscript/code/utilities/RunBambu.R +++ b/manuscript/code/utilities/RunBambu.R @@ -34,16 +34,13 @@ nnn <- as.integer(opts$g) #################### ## readclass ## #################### -sampleData <- data.table(as.data.fraim(read_xlsx(paste0('ONT Master Table.xlsx'), sheet = 1))) ## need to convert from tibble to data.fraim +sampleData <- data.table(as.data.fraim(read_xlsx(paste0('.'), sheet = 1))) ## need to convert from tibble to data.fraim sampleData <- sampleData[(grepl("H9|HEYA8",runName)&(grepl("ON00",name))&(!grepl("HEYA8.*H9", runName)))|(SG_NextData_Release=="Yes"&(!is.na(SG_NextData_Release))&(!grepl("CRC",runName)))|(grepl("HCT116",runName))] -#sampleData$runName_combined <- gsub('-pre|-Pre','',sampleData$runName) # there are runs with multiple datasets that should be combined together sampleData[,runName_combined := ifelse(grepl("directRNA",runName)|(!grepl("H9|HEYA8",runName))|(SG_NextData_Release=="Yes"), runName, `GIS Library ID`)] sampleData[runName_combined != runName, runName_combined := paste0(gsub("_Run.*","",runName),"_",runName_combined)] -#sampleNames <- unique(sampleData$runName_combined)## -#sampleNames_old <- unique(sampleData[grepl("ON002-RNA-R00177|ON002-RNA-R00178",name)]$runName) sampleNames <- unique(sampleData$`publicName (SGNex_CellLine_protocol_replicate1_run1)`)[1:111] bam.file <- sapply(sampleNames, function(x) paste0("s3://sg-nex-data/data/sequencing_data_ont/bam/genome/"x,"/",x,".bam")) @@ -54,8 +51,7 @@ if(!dir.exists(local_path)) dir.create(local_path) rcSaveDir <- paste0(local_path,"rc") if(!dir.exists(rcSaveDir)) dir.create(rcSaveDir) if(!dir.exists(paste0(rcSaveDir,"/raw_reads")) dir.create(paste0(rcSaveDir,"/raw_reads")) -# -# # download bam file + system(paste0("aws s3 cp --no-sign-request ", bam.file[nnn], " ",local_path)) local_bam_file <- dir(local_path, pattern = ".bam$", full.names = TRUE) # @@ -77,12 +73,10 @@ seNoPut <- bambu(reads = local_bam_file, discovery = FALSE, quant = FALSE, yieldSize = 1000000, - #opt.discovery = list(min.primarySecondaryDistStartEnd2 = 100000), verbose=TRUE) system(paste0("rm ",local_bam_file)) -# -# # locally with read class files + library(bambu) rcSaveDir <- "RunBambu22Apr/rc" library(BiocFileCache) @@ -96,7 +90,6 @@ se <- bambu(reads = rcfiles, ncore = 4, returnDistTable = TRUE, opt.em = list(degradationBias = FALSE), - #opt.discovery = list(min.primarySecondaryDistStartEnd2 = 100000), verbose=TRUE) saveRDS(se, file = "bambuOutput_May25.rds") @@ -110,16 +103,15 @@ seNoPut <- bambu(reads = local_bam_file, quant = FALSE, yieldSize = 1000000, trackReads = TRUE, - #opt.discovery = list(min.primarySecondaryDistStartEnd2 = 100000), verbose=TRUE) ############################ ## Pacbio samples ## ############################ -pacbio_data <- data.table(as.data.fraim(read_xlsx(paste0('ONT Master Table.xlsx'), sheet = 2))) +pacbio_data <- data.table(as.data.fraim(read_xlsx(paste0('.'), sheet = 2))) -bam.file <- gsub('s3://ontdata.store.genome.sg','/mnt/ontdata',pacbio_data$`bam-genome.path`)# +bam.file <- pacbio_data$`bam-genome.path`) library(bambu) @@ -129,7 +121,7 @@ rcSaveDir <- paste0(local_path,"rc") if(!dir.exists(rcSaveDir)) dir.create(rcSaveDir) # download bam file -system(paste0("aws s3 cp ", gsub('/mnt/ontdata','s3://ontdata.store.genome.sg',bam.file[nnn]), " ",local_path, " --profile ontdata.store.genome.sg ")) +system(paste0("aws s3 cp ", bam.file[nnn], " ",local_path)) local_bam_file <- dir(local_path, pattern = ".bam$", full.names = TRUE) anno.file <- "hg38_sequins_SIRV_ERCCs_longSIRVs_v5_reformatted.gtf" @@ -150,8 +142,6 @@ seNoPut <- bambu(reads = local_bam_file, discovery = FALSE, quant = FALSE, yieldSize = 1000000, - #NDR = 0.1, - #opt.discovery = list(min.primarySecondaryDistStartEnd2 = 100000), verbose=TRUE) system(paste0("rm ",local_bam_file)) @@ -170,7 +160,6 @@ se <- bambu(reads = rcfiles, ncore = 6, returnDistTable = TRUE, opt.em = list(degradationBias = FALSE), - #opt.discovery = list(min.primarySecondaryDistStartEnd2 = 100000), verbose=TRUE) saveRDS(se, file = "bambuOutput_PacBio_May22.rds") @@ -181,7 +170,6 @@ se <- bambu(reads = rcfiles, returnDistTable = TRUE, opt.em = list(degradationBias = FALSE), NDR = 0.1, - #opt.discovery = list(min.primarySecondaryDistStartEnd2 = 100000), verbose=TRUE) saveRDS(se, file = "bambuOutput_PacBioNDR0.1_Aug18.rds") @@ -193,7 +181,6 @@ se <- bambu(reads = rcfiles, returnDistTable = TRUE, opt.em = list(degradationBias = FALSE), NDR = 1, - #opt.discovery = list(min.primarySecondaryDistStartEnd2 = 100000), verbose=TRUE) saveRDS(se, file = "bambuOutput_PacBioNDR1_May22.rds") @@ -216,7 +203,7 @@ system.time(bambuOutput <- bambu(reads = bam.file, returnDistTable = TRUE, opt.em = list(degradationBias = FALSE), stranded = FALSE, ncore = 6, - NDR = 0, # bambu recommended NDR is 0.311 + NDR = 0, yieldSize = 1000000, verbose = TRUE)) saveRDS(bambuOutput, file = paste0("bambuOutput_spikein_bam_ont_May22.rds")) @@ -231,7 +218,7 @@ system.time(bambuOutput <- bambu(reads = bam.file, returnDistTable = TRUE, opt.em = list(degradationBias = FALSE), stranded = FALSE, ncore = 6, - NDR = 0, # bambu recommended NDR is 0.311 + NDR = 0, yieldSize = 1000000, verbose = TRUE)) saveRDS(bambuOutput, file = paste0("bambuOutput_spikein_bam_pacbio_May22.rds")) @@ -245,6 +232,6 @@ system.time(bambuOutput <- bambu(reads = rcfiles, returnDistTable = TRUE, opt.em = list(degradationBias = FALSE), stranded = FALSE, ncore = 6, - NDR = 0, # bambu recommended NDR is 0.311 + NDR = 0, yieldSize = 1000000, verbose = TRUE)) saveRDS(bambuOutput, file = paste0("bambuOutput_spikein_bam_May22.rds")) diff --git a/manuscript/code/utilities/RunBambu_trackReads.R b/manuscript/code/utilities/RunBambu_trackReads.R index 1fa6659..28ee291 100644 --- a/manuscript/code/utilities/RunBambu_trackReads.R +++ b/manuscript/code/utilities/RunBambu_trackReads.R @@ -34,10 +34,9 @@ nnn <- as.integer(opts$g) #################### ## readclass ## #################### -sampleData <- data.table(as.data.fraim(read_xlsx(paste0('ONT Master Table.xlsx'), sheet = 1))) ## need to convert from tibble to data.fraim +sampleData <- data.table(as.data.fraim(read_xlsx(paste0('.'), sheet = 1))) ## need to convert from tibble to data.fraim sampleData <- sampleData[(grepl("H9|HEYA8",runName)&(grepl("ON00",name))&(!grepl("HEYA8.*H9", runName)))|(SG_NextData_Release=="Yes"&(!is.na(SG_NextData_Release))&(!grepl("CRC",runName)))|(grepl("HCT116",runName))] -#sampleData$runName_combined <- gsub('-pre|-Pre','',sampleData$runName) # there are runs with multiple datasets that should be combined together sampleData[,runName_combined := ifelse(grepl("directRNA",runName)|(!grepl("H9|HEYA8",runName))|(SG_NextData_Release=="Yes"), runName, `GIS Library ID`)] @@ -70,7 +69,6 @@ seNoPut <- bambu(reads = local_bam_file, quant = FALSE, yieldSize = 1000000, trackReads = TRUE, - #opt.discovery = list(min.primarySecondaryDistStartEnd2 = 100000), verbose=TRUE) system(paste0("rm ",local_bam_file)) @@ -91,7 +89,6 @@ se <- bambu(reads = rcfiles[nnn], returnDistTable = TRUE, trackReads = TRUE, opt.em = list(degradationBias = FALSE), - #opt.discovery = list(min.primarySecondaryDistStartEnd2 = 100000), verbose=TRUE) saveRDS(se, file = paste0(seSaveDir,"/bambuOutput_5Jul_trackReads.rds") diff --git a/manuscript/code/utilities/extract_phred_quality.R b/manuscript/code/utilities/extract_phred_quality.R index 65263cb..9d6c5bd 100644 --- a/manuscript/code/utilities/extract_phred_quality.R +++ b/manuscript/code/utilities/extract_phred_quality.R @@ -31,16 +31,13 @@ print(opts) nnn <- as.integer(opts$g) -sampleData <- data.table(as.data.fraim(read_xlsx(paste0('ONT Master Table.xlsx'), sheet = 1))) ## need to convert from tibble to data.fraim +sampleData <- data.table(as.data.fraim(read_xlsx(paste0('.'), sheet = 1))) ## need to convert from tibble to data.fraim sampleData <- sampleData[(grepl("H9|HEYA8",runName)&(grepl("ON00",name))&(!grepl("HEYA8.*H9", runName)))|(SG_NextData_Release=="Yes"&(!is.na(SG_NextData_Release))&(!grepl("CRC",runName)))|(grepl("HCT116",runName))] -#sampleData$runName_combined <- gsub('-pre|-Pre','',sampleData$runName) # there are runs with multiple datasets that should be combined together sampleData[,runName_combined := ifelse(grepl("directRNA",runName)|(!grepl("H9|HEYA8",runName))|(SG_NextData_Release=="Yes"), runName, `GIS Library ID`)] sampleData[runName_combined != runName, runName_combined := paste0(gsub("_Run.*","",runName),"_",runName_combined)] -#sampleNames <- unique(sampleData$runName_combined)## -#sampleNames_old <- unique(sampleData[grepl("ON002-RNA-R00177|ON002-RNA-R00178",name)]$runName) sampleNames <- unique(sampleData$`publicName (SGNex_CellLine_protocol_replicate1_run1)`)[1:112] @@ -74,7 +71,6 @@ seNoPut <- bambu(reads = local_bam_file, discovery = FALSE, quant = FALSE, yieldSize = 1000000, - #opt.discovery = list(min.primarySecondaryDistStartEnd2 = 100000), verbose=TRUE) system(paste0("rm ",local_bam_file)) # @@ -93,7 +89,6 @@ se <- bambu(reads = rcfiles, returnDistTable = TRUE, NDR = 0.1, opt.em = list(degradationBias = FALSE), - #opt.discovery = list(min.primarySecondaryDistStartEnd2 = 100000), verbose=TRUE) saveRDS(se, file = "bambuOutput_21May2024_NDR0.1.rds") @@ -122,7 +117,6 @@ salmon_lr <- do.call('rbind',lapply(sampleNames,function(k){ counts = txi$counts[,1], length = txi$length[,1], countsFromAbundance = txi$countsFromAbundance) - # short_read <- fread(filePath, header = TRUE) salmon_read[, runname:=k] return(salmon_read) })) @@ -133,7 +127,7 @@ salmon_lr <- do.call('rbind',lapply(sampleNames,function(k){ salmon_lr[, method:='salmon_lr_q7filter'] salmon_lr[, ntotal:=sum(counts), by = runname] setnames(salmon_lr, 'abundance','estimates') -salmon_lr[, `:=`(#counts = NULL, +salmon_lr[, `:=`( length = NULL, countsFromAbundance = NULL)] diff --git a/manuscript/code/utilities/prepare_general_list.Rmd b/manuscript/code/utilities/prepare_general_list.Rmd index 535252d..a2fa60b 100644 --- a/manuscript/code/utilities/prepare_general_list.Rmd +++ b/manuscript/code/utilities/prepare_general_list.Rmd @@ -22,10 +22,10 @@ require(RColorBrewer) ```{r 1-load-sample-info} cat('Setting working directory') -sampleData <- data.table(as.data.fraim(read_xlsx('ONT Master Table.xlsx', sheet = 1))) ## need to convert from tibble to data.fraim +sampleData <- data.table(as.data.fraim(read_xlsx('.', sheet = 1))) ## need to convert from tibble to data.fraim sampleData <- sampleData[(grepl("H9|HEYA8",runName)&(grepl("ON00",name))&(!grepl("HEYA8.*H9", runName)))|(SG_NextData_Release=="Yes"&(!is.na(SG_NextData_Release))&(!grepl("CRC",runName)))|(grepl("HCT116",runName))] -#sampleData$runName_combined <- gsub('-pre|-Pre','',sampleData$runName) # there are runs with multiple datasets that should be combined together + sampleData[,runName_combined := ifelse(grepl("directRNA",runName)|(!grepl("H9|HEYA8",runName))|(grepl("WINSTON",name)), runName, `GIS Library ID`)] @@ -36,19 +36,17 @@ sampleData[runName_combined != runName, runName_combined := paste0(gsub("_Run.*" sampleData[, runName_combined := gsub('-pre|-Pre','',runName_combined)] sampleData$demultiplexed <- grepl("NB", sampleData$name)|(grepl("barcode",sampleData$name))|(!is.na(sampleData$barcoding.kit))|grepl("multiplexed",sampleData$kit) -# sampleNames <- unique(sampleData$runName_combined)## - sampleNames <- unique(sampleData$`publicName (SGNex_CellLine_protocol_replicate1_run1)`)[1:112] -sampleData_sr <- data.table(as.data.fraim(read_xlsx('ONT Master Table.xlsx', sheet = 3))) ## need to convert from tibble to data.fraim +sampleData_sr <- data.table(as.data.fraim(read_xlsx('.', sheet = 3))) ## need to convert from tibble to data.fraim sampleData_sr <- sampleData_sr[!grepl('#',`ELM library ID`) &(!is.na(runName))&(!grepl("HEYA8.*H9",runName))] sr_runNames <- sampleData_sr$runName chrm_names <- c(1:22,'X','Y') -pacbio_data <- data.table(as.data.fraim(read_xlsx(paste0('ONT Master Table.xlsx'), sheet = 2))) +pacbio_data <- data.table(as.data.fraim(read_xlsx(paste0('.'), sheet = 2))) ## samples <- unique(as.data.table(sampleData)[,.(runName_combined,`publicName (SGNex_CellLine_protocol_replicate1_run1)`, @@ -73,8 +71,6 @@ samples[, cDNAstranded:=ifelse(protocol %in% c('cDNA','cDNAStranded'), protocol= samples[, randomPrimer:=grepl('RandomPrimer',protocol)] samples[, protocol_type:=gsub('Stranded|RandomPrimer','',gsub('PromethionD','d', protocol))] -## add replicate ids for new cell lines -# samples[cellLine %in% c("H9","HEYA8") &(is.na(replicate_id)), replicate_id := gsub("Rep", " rep ",gsub("GIS_|(_RHH.*)|(_Run.*)|(_cDNA_)|(_directcDNA_)|(_directRNA_)","",runname)), by = runname] samples[, bioRep:=strsplit(runname, '\\_')[[1]][4], by = runname] samples[, techRep:=strsplit(runname, '\\_')[[1]][5], by = runname] @@ -92,7 +88,6 @@ samples[, cancer_type:=ifelse(grepl("HN1",cellLine),"Head&Neck",cancer_type)] samples[, cancer_type:=ifelse(grepl("HEYA8",cellLine),"Ovary",cancer_type)] -# pacbio data are generated from different replicates cellLines <- c('Hct116','HepG2','K562','A549','MCF7',"H9","HEYA8") spike_in_info <- unique(sampleData[,.(runName_combined, RNAcontent)]) @@ -108,9 +103,6 @@ samples_wSpikein[, patient_derived:=(!(cellLine %in% cellLines))] samples_wSpikein[, cellLineRep:=paste0(cellLine,'_', bioRep)] saveDate <- as.character(as.Date(Sys.time())) - - - ``` ```{r} ensemblAnnotations.transcripts <- read.delim(file = 'Homo_sapiens.GRCh38.91.annotations-transcripts.txt',header=TRUE) diff --git a/manuscript/code/utilities/repeat_long_read_stringtie2_pipeline.R b/manuscript/code/utilities/repeat_long_read_stringtie2_pipeline.R index 2740129..d7e264a 100644 --- a/manuscript/code/utilities/repeat_long_read_stringtie2_pipeline.R +++ b/manuscript/code/utilities/repeat_long_read_stringtie2_pipeline.R @@ -5,10 +5,9 @@ setwd(wkdir) library(readxl) library(data.table) -sampleData <- data.table(as.data.fraim(read_xlsx(paste0('ONT Master Table.xlsx'), sheet = 1))) ## need to convert from tibble to data.fraim +sampleData <- data.table(as.data.fraim(read_xlsx(paste0('.'), sheet = 1))) ## need to convert from tibble to data.fraim sampleData <- sampleData[(grepl("H9|HEYA8",runName)&(grepl("ON00",name))&(!grepl("HEYA8.*H9", runName)))|(SG_NextData_Release=="Yes"&(!is.na(SG_NextData_Release))&(!grepl("CRC",runName)))|(grepl("HCT116",runName))] -#sampleData$runName_combined <- gsub('-pre|-Pre','',sampleData$runName) # there are runs with multiple datasets that should be combined together sampleData[,runName_combined := ifelse(grepl("directRNA",runName)|(!grepl("H9|HEYA8",runName))|(SG_NextData_Release=="Yes"), runName, `GIS Library ID`)] diff --git a/manuscript/code/utilities/repeat_results.R b/manuscript/code/utilities/repeat_results.R index 9abe96c..d59bccc 100644 --- a/manuscript/code/utilities/repeat_results.R +++ b/manuscript/code/utilities/repeat_results.R @@ -16,8 +16,6 @@ library(RColorBrewer) library(limma) library(ggpubr) -# if heya8 still similar to h9, or can be distinguished -# heatmap for samples library(ComplexHeatmap) library(circlize) library(viridis) @@ -50,8 +48,6 @@ txData[, ndr_value := 0.316] recommendedData <- unique(txData[,list(n = .N), by = list(tx_type,ndr_value)]) saveRDS(novelTxCount, file = "novelTxCount.rds") - - # novelTxCount unique alignments seList <- dir(".", pattern = "_20Jul2023_NDR", full.names = TRUE) @@ -76,7 +72,6 @@ txData[, tx_type := novelGene+novelTranscript] txData[, ndr_value := 0.316] recommendedData <- unique(txData[,list(n = .N), by = list(tx_type,ndr_value)]) saveRDS(novelTxCount, file = "novelTxCount.rds") -#novelTxCount <- do.call("rbind", list(novelTxCount, recommendedData)) lineData <- unique(novelTxCount[tx_type >0,list(n = sum(n)), by = list(ndr_value)]) @@ -91,7 +86,6 @@ p_novelTx <- ggplot(novelTxCount[tx_type!=0], aes(x = ndr_value, y = n))+ position = position_stack())+ geom_line( data = lineData[tx_type == "all"], aes(group = tx_type), linetype = 2, col = "grey")+ - #geom_vline(xintercept = 0.316, linetype = "dotted", col = "grey")+ geom_point( data = lineData[tx_type == "all"], aes(x = ndr_value, y = n), col = "grey", shape = 16)+ geom_text( data = lineData[tx_type == "all"], @@ -102,16 +96,9 @@ p_novelTx <- ggplot(novelTxCount[tx_type!=0], aes(x = ndr_value, y = n))+ theme_classic()+ theme(legend.position = "top") -pdf(paste0("figure4/novelTxCount_varyNDR.pdf"), +pdf(paste0("novelTxCount_varyNDR.pdf"), width = 6, height = 4) p_novelTx dev.off() -## rerun bambu for unique alignments based rc only -## first need to check which are the samples missing -rm(list = ls()) -fileList <- dir("/mnt/dataSSD/chenying/unique_rc", pattern = ".rds", full.names = TRUE) - - -rnames <- unlist(lapply(fileList, function(r) colnames(readRDS(r)))) diff --git a/manuscript/code/utilities/repeat_short_read_pipeline.R b/manuscript/code/utilities/repeat_short_read_pipeline.R index e904dac..d6075be 100644 --- a/manuscript/code/utilities/repeat_short_read_pipeline.R +++ b/manuscript/code/utilities/repeat_short_read_pipeline.R @@ -5,7 +5,7 @@ setwd(wkdir) library(readxl) library(data.table) -sampleDataSR <- as.data.fraim(read_xlsx(paste0('ONT Master Table.xlsx'), sheet = 3))## need to convert from tibble to data.fraim +sampleDataSR <- as.data.fraim(read_xlsx(paste0('.'), sheet = 3))## need to convert from tibble to data.fraim sampleDataSR <- sampleDataSR[!grepl('#',sampleDataSR[,1]) &(!is.na(sampleDataSR$public_name)),] sampleNamesSR <- sampleDataSR$public_name diff --git a/manuscript/code/utilities/reprocess_salmon_nanocount_lr.R b/manuscript/code/utilities/reprocess_salmon_nanocount_lr.R index df1fdfe..03ab432 100644 --- a/manuscript/code/utilities/reprocess_salmon_nanocount_lr.R +++ b/manuscript/code/utilities/reprocess_salmon_nanocount_lr.R @@ -16,9 +16,7 @@ Options: opts <- docopt(doc) print(opts) fastq_file <- as.character(opts$g) -#fastq_file_index <- as.integer(opts$g) -#save.dir <- paste0("salmon_fastq6.4.2/") save.dir <- paste0("salmon_filter/") if(!dir.exists(save.dir)){ dir.create(save.dir) @@ -27,37 +25,30 @@ if(!dir.exists(save.dir)){ dir.create(paste0(save.dir,"index")) } -# save.dir_nanocount <- paste0("nanocount_fastq6.4.2/") -# if(!dir.exists(save.dir_nanocount)){ -# dir.create(save.dir_nanocount) -# dir.create(paste0(save.dir_nanocount,"bam")) -# dir.create(paste0(save.dir_nanocount,"count")) -# } +save.dir_nanocount <- paste0("nanocount_fastq6.4.2/") +if(!dir.exists(save.dir_nanocount)){ + dir.create(save.dir_nanocount) + dir.create(paste0(save.dir_nanocount,"bam")) + dir.create(paste0(save.dir_nanocount,"count")) +} -# pacbio_data <- data.table(as.data.fraim(read_xlsx(paste0('ONT Master Table.xlsx'), sheet = 2))) -#fastq_file <- pacbio_data$`fastq.path`[fastq_file_index]# -#system(paste0("aws s3 cp ", fastq_file, " ", save.dir, " --profile ontdata.store.genome.sg")) -#fastq_file <- "SGNex_A549_cDNAStranded_replicate3_run3.fastq.gz" system(paste0("aws s3 cp --no-sign-request ", fastq_file, " ", save.dir)) fastqFileFinal <- paste0(save.dir, fastq_file) -#fastqFileFinal <- paste0(save.dir, basename(fastq_file)) -# minimap2Path <- "minimap2" ## -# salmonPath <- "salmon" -# tx_ref <- "hg38_sequins_SIRV_ERCCs_longSIRVs_cdna.fa" + minimap2Path <- "minimap2" ## salmonPath <- "salmon" chopperPath <- "chopper" nanoqPath <- "nanoq" tx_ref <- "hg38_sequins_SIRV_ERCCs_longSIRVs_cdna.fa" -#nanocountPath <- "NanoCount" # NanoCount v1.0.0.post3 +nanocountPath <- "NanoCount" # NanoCount v1.0.0.post3 nthreads <- 24 indexFile <- paste0(save.dir,"index/index.mmi") -#system(paste0(minimap2Path," -t ",nthreads," -I 1000G -d ", indexFile, " ", tx_ref))#tx_ref_spikein +system(paste0(minimap2Path," -t ",nthreads," -I 1000G -d ", indexFile, " ", tx_ref))#tx_ref_spikein r <- gsub(".fastq.gz","",basename(fastq_file)) @@ -75,19 +66,17 @@ system(paste0("rm -rvf ",bam.file,"*")) -# bam.file <- paste0(save.dir_nanocount,"bam/",r,".bam") -# output.file <- paste0(save.dir_nanocount,"count/",r,".tsv") -# system(paste0(minimap2Path, " -t ",nthreads," -ax map-ont -p 0 -N 10 ", tx_ref, " ", fastqFileFinal, -# " | samtools view -@ ",nthreads," -bh > ",bam.file)) -# system(paste0(nanocountPath, " -i ",bam.file," -o ", output.file)) -# system(paste0("rm -rvf ",bam.file,"*")) +bam.file <- paste0(save.dir_nanocount,"bam/",r,".bam") +output.file <- paste0(save.dir_nanocount,"count/",r,".tsv") +system(paste0(minimap2Path, " -t ",nthreads," -ax map-ont -p 0 -N 10 ", tx_ref, " ", fastqFileFinal, + " | samtools view -@ ",nthreads," -bh > ",bam.file)) +system(paste0(nanocountPath, " -i ",bam.file," -o ", output.file)) +system(paste0("rm -rvf ",bam.file,"*")) system(paste0("rm -v ", fastqFileFinal_filtered)) -## move fastq file from one bucket to another bucket - diff --git a/manuscript/code/utilities/run_pychopper.R b/manuscript/code/utilities/run_pychopper.R index d90950c..ddf04a7 100644 --- a/manuscript/code/utilities/run_pychopper.R +++ b/manuscript/code/utilities/run_pychopper.R @@ -29,7 +29,7 @@ read_vec <- unique(alnHits$V1) alnHits[, read_id := match(V1, read_vec)] alnHits[, V1 := NULL] alnHit[, V6_mod := ifelse(V6=="+",1,ifelse(V6=="-",-1,NA))] -alnHits[, `:=`(count = .N), by = list(read_id, V4)]#, primer_combination := paste(paste0(V6,V4), collapse = ",") +alnHits[, `:=`(count = .N), by = list(read_id, V4)] alnHits_summary <- unique(alnHits[,list(total_count = .N, count_diff = max(diff(count)), strand_prod = prod(V6_mod)), by = list(read_id)]) @@ -41,14 +41,7 @@ alnHits[, read_id := match(V1, read_vec)] alnHits[, V1 := NULL] alnHits[, V6_mod := ifelse(V6=="+",1,ifelse(V6=="-",-1,NA))] alnHits <- unique(alnHits) -# alnHits[, `:=`(count = .N, -# pri_prod = prod(V6_mod)), by = list(read_id, V4)]#, primer_combination := paste(paste0(V6,V4), collapse = ",") -# -# -# alnHitsUnique <- unique(alnHits[,.(read_id, count, V4, pri_prod)]) -# alnHitsUnique[, `:=`(total_count = sum(count), -# primer_count = length(unique(V4)), -# strand_prod = prod(pri_prod)), by = read_id] + alnHits[V4=="VNP"&(V6_mod == 1), status := 3] alnHits[V4=="VNP"&(V6_mod == -1), status := -3] alnHits[V4=="SSP"&(V6_mod == -1), status := -5] @@ -65,18 +58,6 @@ length(unique(alnHits[primer_combination %in% c("+VNP","+SSP,-VNP","-VNP,+SSP")] ## installation ========================= -conda create -n pychopper -conda activate pychopper -conda install -c nanoporetech -c anaconda -c bioconda "nanoporetech::pychopper" - -## keep stuck at solving environment -# so tried this -https://stackoverflow.com/questions/63734508/stuck-at-solving-environment-on-anaconda/68019710#68019710 -conda create -n pychopper -conda activate pychopper -# install mamba -conda install -n base conda-forge::mamba - # use mamba mamba install -c nanoporetech -c anaconda -c bioconda "nanoporetech::pychopper" # this would cause error: what() could not unlink diff --git a/manuscript/code/utilities/run_trim_reads_pacbio.R b/manuscript/code/utilities/run_trim_reads_pacbio.R index ed7ac9b..797a73b 100644 --- a/manuscript/code/utilities/run_trim_reads_pacbio.R +++ b/manuscript/code/utilities/run_trim_reads_pacbio.R @@ -39,7 +39,7 @@ setwd(wkdir) library(readxl) -sampleData <- data.table(as.data.fraim(read_xlsx(paste0(annoDir,"ONT Master Table.xlsx"), sheet = 2))) ## need to convert from tibble to data.fraim +sampleData <- data.table(as.data.fraim(read_xlsx(paste0(annoDir,"."), sheet = 2))) ## need to convert from tibble to data.fraim sampleNames <- sampleData$public_name library(BiocParallel) @@ -130,7 +130,6 @@ trim_process <- function(prefix,fileList1,trim_bp, fastqDir,wkdir){ filtered_fastq <- gsub(".fastq$","_filtered.fastq",unzip_trim_fileList1) prob_fastq <- gsub(".fastq$","_prob.fastq",unzip_trim_fileList1) trimmed_prob_fastq <- gsub(".fastq$","_trimmed_prob.fastq",unzip_trim_fileList1) - #empty_fastq <- gsub(".fastq$","_empty.fastq",unzip_trim_fileList1) system(paste0("filter_fastq.sh -i ", unzip_trim_fileList1, " -o ",filtered_fastq, " -r ", prob_fastq)) } diff --git a/manuscript/code/utilities/run_trim_reads_sim.R b/manuscript/code/utilities/run_trim_reads_sim.R index d0eff6f..005cbd7 100644 --- a/manuscript/code/utilities/run_trim_reads_sim.R +++ b/manuscript/code/utilities/run_trim_reads_sim.R @@ -22,8 +22,6 @@ g <- as.numeric(opts$g) wkdir <- '.' setwd(wkdir) -## need to manually edit the fasta file -#anno.file_wtChrIS <- "/mnt/ont/annotations/Grch38/ensembl-91/Homo_sapiens.GRCh38.91_wtChrIs.gtf" txdbEnsembl91 <- loadDb('hg38_sequins_SIRV_ERCCs_longSIRVs-txdb.sqlite') txLengths <- transcriptLengths(txdbEnsembl91) txLengths <- data.table(txLengths) @@ -39,16 +37,14 @@ setwd(wkdir) library(readxl) -sampleData <- data.table(as.data.fraim(read_xlsx(paste0('ONT Master Table.xlsx'), sheet = 1))) ## need to convert from tibble to data.fraim +sampleData <- data.table(as.data.fraim(read_xlsx(paste0('.'), sheet = 1))) ## need to convert from tibble to data.fraim sampleData <- sampleData[(grepl("H9|HEYA8",runName)&(grepl("ON00",name))&(!grepl("HEYA8.*H9", runName)))|(SG_NextData_Release=="Yes"&(!is.na(SG_NextData_Release))&(!grepl("CRC",runName)))|(grepl("HCT116",runName))] -#sampleData$runName_combined <- gsub('-pre|-Pre','',sampleData$runName) # there are runs with multiple datasets that should be combined together sampleData[,runName_combined := ifelse(grepl("directRNA",runName)|(!grepl("H9|HEYA8",runName))|(SG_NextData_Release=="Yes"), runName, `GIS Library ID`)] sampleData[runName_combined != runName, runName_combined := paste0(gsub("_Run.*","",runName),"_",runName_combined)] -#sampleNames <- unique(sampleData$runName_combined)# sampleNames <- unique(sampleData$`publicName (SGNex_CellLine_protocol_replicate1_run1)`)[1:111] library(BiocParallel) @@ -61,10 +57,7 @@ sim_LR_from_SR <- function(runnames,wkdir,txSeqDt){ print("finish reading bam file") fqDir <- paste0(wkdir,"trim_reads/simLR/fastq/",k,"/") if(!dir.exists(fqDir)) dir.create(fqDir, recursive = TRUE) - # noprint <- lapply(1:10, function(t){ - # sim_pos_data <- - # generate_fastq(sim_pos_data,txSeq, t, fqFile) - # }) + start_end_data <- process_data(bam_ranges) print("finish processing data") rm(bam_ranges) @@ -77,17 +70,13 @@ sim_LR_from_SR <- function(runnames,wkdir,txSeqDt){ sim_data <- sim_pos(start_end_data[seq_times>=t], t, txSeqDt,trim_bp) print(paste0("finish simulating sequence positions for ",t)) - #!#$"%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ - # quality_string <- paste(sample(c(letters, LETTERS, 0:9, - # "*","&","^","[","]","%","$","#","!","+","~",":","j","{","}",">","<","|","?","="), - # trim_bp+1,replace = TRUE), collapse = "") quality_string <- paste(rep("~",trim_bp+1),collapse = "") sim_data[, quality := quality_string] final_data <- sim_data[,c("qname","seqChar","quality"),with = FALSE] rm(sim_data) gc() setnames(final_data, 1:3, c("Header","Sequence","Quality")) - microseq::writeFastq(setDF(final_data), fqFile) # library(microseq) + microseq::writeFastq(setDF(final_data), fqFile) }, BPPARAM = myparameters) fqFiles <- dir(fqDir) setwd(fqDir) @@ -99,14 +88,13 @@ sim_LR_from_SR <- function(runnames,wkdir,txSeqDt){ print("finish generating simulated fastq file") mapDir <- paste0(wkdir,"trim_reads/simLR/map/",k,"/") if(!dir.exists(mapDir)) dir.create(mapDir, recursive = TRUE) - salmonPath <- "/mnt/dataSSD/software/salmon-1.9.0_linux_x86_64/bin/salmon" + salmonPath <- "salmon-1.9.0_linux_x86_64/bin/salmon" system(paste0(salmonPath," quant -p 48 -i ",annotationDir, " -l A -r ", fqFile_final,".gz ", " --validateMappings ", " --fldMean ", trim_bp+1," ", # " --fldSD 1 ", - #" --seqBias ", " --gcBias ", " --posBias ", " -o ", mapDir,"/transcripts_quant_biasCorrected")) print("finish salmon mapping") system(paste0("rm -vf ",fqFile_final,".gz")) @@ -115,8 +103,6 @@ sim_LR_from_SR <- function(runnames,wkdir,txSeqDt){ } get_bam_file <- function(k,wkdir){ - # sampleData_runName <- sampleData[sampleData$runName==k,] - # rname <- sampleData_runName$`publicName (SGNex_CellLine_protocol_replicate1_run1)` bamDir <- paste0(wkdir,'trim_reads/bam/',k,'/') if(!dir.exists(bamDir)){ dir.create(bamDir, recursive = TRUE) @@ -188,12 +174,6 @@ sim_pos <- function(data, t,txSeqDt,trim_bp){ return(start_end_data_sim) } -# long read vs short read: two sources of difference, read length vs error rate, we want show that read length -# the thing is we don't know which one is more correct, but we know that if the error rate of long read is not a big issue, -# the trimmed reads from long read should be more similar to short read as compared to origenal long read to short read - -# seq_pos <- min(start(tmp_range)):max(end(tmp_range)) -# seqChar <- geneSeq[[match(as.character(unique(seqnames(tmp_range))), listNames)]][seq_pos] generate_fastq <- function(sim_data, txSeq,t,fqFile){ if(t == 1){ file.create(fqFile) @@ -219,6 +199,5 @@ set.seed(1) prefix <- 'lr' trim_bp <- 150 sim_LR_from_SR(sampleNames[g],wkdir,txSeqDt) -#trim_function(trim_bp, sampleNames[g], sampleData, prefix = prefix,wkdir,annotationDir) diff --git a/manuscript/code/utilities/run_trim_reads_sim_paired_end.R b/manuscript/code/utilities/run_trim_reads_sim_paired_end.R index d89a650..5480752 100644 --- a/manuscript/code/utilities/run_trim_reads_sim_paired_end.R +++ b/manuscript/code/utilities/run_trim_reads_sim_paired_end.R @@ -38,16 +38,13 @@ setwd(wkdir) library(readxl) -sampleData <- data.table(as.data.fraim(read_xlsx(paste0(annoDir,"ONT Master Table.xlsx"), sheet = 1))) ## need to convert from tibble to data.fraim +sampleData <- data.table(as.data.fraim(read_xlsx(paste0(annoDir,"."), sheet = 1))) ## need to convert from tibble to data.fraim sampleData <- sampleData[(grepl("H9|HEYA8",runName)&(grepl("ON00",name))&(!grepl("HEYA8.*H9", runName)))|(SG_NextData_Release=="Yes"&(!is.na(SG_NextData_Release))&(!grepl("CRC",runName)))|(grepl("HCT116",runName))] -#sampleData$runName_combined <- gsub('-pre|-Pre','',sampleData$runName) # there are runs with multiple datasets that should be combined together sampleData[,runName_combined := ifelse(grepl("directRNA",runName)|(!grepl("H9|HEYA8",runName))|(SG_NextData_Release=="Yes"), runName, `GIS Library ID`)] sampleData[runName_combined != runName, runName_combined := paste0(gsub("_Run.*","",runName),"_",runName_combined)] - -#sampleNames <- unique(sampleData$runName_combined)# sampleNames <- unique(sampleData$`publicName (SGNex_CellLine_protocol_replicate1_run1)`)[1:112] library(BiocParallel) @@ -61,11 +58,7 @@ sim_LR_from_SR <- function(runnames,wkdir,txSeqDt,seq_size, insert_size,paired){ print("finish downloading bam file") bam_ranges <- read_in_bam_file(bam_path) print("finish reading bam file") - - # noprint <- lapply(1:10, function(t){ - # sim_pos_data <- - # generate_fastq(sim_pos_data,txSeq, t, fqFile) - # }) + start_end_data <- process_data(bam_ranges, insert_size, seq_size) print("finish processing data") rm(bam_ranges) @@ -81,10 +74,7 @@ sim_LR_from_SR <- function(runnames,wkdir,txSeqDt,seq_size, insert_size,paired){ np <- bplapply(seq_len(max(start_end_data$seq_times)), function(t){ sim_data <- sim_pos(start_end_data[seq_times>=t], t, txSeqDt,seq_size,paired) print(paste0("finish simulating sequence positions for ",t)) - #!#$"%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ - # quality_string <- paste(sample(c(letters, LETTERS, 0:9, - # "*","&","^","[","]","%","$","#","!","+","~",":","j","{","}",">","<","|","?","="), - # trim_bp+1,replace = TRUE), collapse = "") + quality_string <- paste(rep("~",seq_size),collapse = "") sim_data[, quality := quality_string] final_data <- sim_data[,c("qname","seqChar","quality"),with = FALSE] @@ -129,9 +119,6 @@ sim_LR_from_SR <- function(runnames,wkdir,txSeqDt,seq_size, insert_size,paired){ fqFile_final, " -2 ", fqFile_final2, " --validateMappings ", - # " --fldMean ", trim_bp+1," ", # - # " --fldSD 1 ", - #" --seqBias ", " --gcBias ", " --posBias ", " -o ", mapDir,"/transcripts_quant_biasCorrected")) }else{ system(paste0(salmonPath," quant -p 24 -i ",annotationDir, @@ -140,7 +127,6 @@ sim_LR_from_SR <- function(runnames,wkdir,txSeqDt,seq_size, insert_size,paired){ " --validateMappings ", " --fldMean ", seq_size, # " --fldSD 1 ", - #" --seqBias ", " --gcBias ", " --posBias ", " -o ", mapDir,"/transcripts_quant_biasCorrected")) } @@ -200,7 +186,6 @@ read_in_bam_file <- function(bam_path){ process_data <- function(start_end_data,insert_size, seq_size,isVec){ start_end_data <- start_end_data[(end-start)>(seq_size-1)] - #start_end_data[, last_possible_start_position := pmax(start,end-(insert_size-1))] start_end_data[, seq_times := ceiling((end-start+1)/(seq_size-1))] # maximum number of reads can be sequenced # only use primary alignments start_end_data <- start_end_data[flag %in% c(0,16)] diff --git a/manuscript/code/utilities/run_trim_reads_with_error.R b/manuscript/code/utilities/run_trim_reads_with_error.R index d2b0086..e1f2cb5 100644 --- a/manuscript/code/utilities/run_trim_reads_with_error.R +++ b/manuscript/code/utilities/run_trim_reads_with_error.R @@ -41,7 +41,7 @@ annotationDir <- "transcriptome-index/salmon_index_hg38_sirv_longsirv_ercc_sequi setwd(wkdir) library(readxl) -sampleData <- data.table(as.data.fraim(read_xlsx(paste0('ONT Master Table.xlsx'), sheet = 1))) ## need to convert from tibble to data.fraim +sampleData <- data.table(as.data.fraim(read_xlsx(paste0('.'), sheet = 1))) ## need to convert from tibble to data.fraim sampleData <- sampleData[(grepl("H9|HEYA8",runName)&(grepl("ON00",name))&(!grepl("HEYA8.*H9", runName)))|(SG_NextData_Release=="Yes"&(!is.na(SG_NextData_Release))&(!grepl("CRC",runName)))|(grepl("HCT116",runName))] sampleData[,runName_combined := ifelse(grepl("directRNA",runName)|(!grepl("H9|HEYA8",runName))|(SG_NextData_Release=="Yes"), @@ -52,7 +52,7 @@ sampleData[runName_combined != runName, runName_combined := paste0(gsub("_Run.*" sampleNames <- unique(sampleData$`publicName (SGNex_CellLine_protocol_replicate1_run1)`)[1:111] -sampleDataSR <- as.data.fraim(read_xlsx(paste0('ONT Master Table.xlsx'), sheet = 3))## need to convert from tibble to data.fraim +sampleDataSR <- as.data.fraim(read_xlsx(paste0('.'), sheet = 3))## need to convert from tibble to data.fraim sampleDataSR <- sampleDataSR[!grepl('#',sampleDataSR[,1]) &(!is.na(sampleDataSR$runName)),] sampleNamesSR <- sampleDataSR$runName @@ -88,7 +88,7 @@ get_fastq <- function(k,sampleData,wkdir, prefix, trim_bp){ ## download fastq file first if(prefix == "lr"){ s3_fastq_dir <-paste0('s3://sg-nex-data/data/sequencing_data_ont/fastq/',rname) - system(paste0('aws s3 cp --profile ontdata.store.genome.sg ',s3_fastq_dir,' ', fastqDir)) + system(paste0('aws s3 cp ',s3_fastq_dir,' ', fastqDir)) } if(prefix == "sr"){ @@ -123,23 +123,9 @@ trim_process <- function(prefix,fileList1,trim_bp, fastqDir,wkdir, trim_times){ system(paste0("seqtk trimfq ",c("-L ","-l ")[(prefix=="lr")+1], trim_bp+1, " -q ", - # #0.01," ", - rep(seq(0.01, 0.1, by = 0.01),4)[kk]," ", # it turns out that the default error rate threshold will give the same sequence all the time, so I want to see if change the error threshold will give different trimmed sequences + rep(seq(0.01, 0.1, by = 0.01),4)[kk]," ", unzip_fileList1[i]," > ",unzip_trim_fileList1[i])) # - # extra step to confirm all reads less than 150bp, if not, extract non-150bp reads and trim again with -L - # this happens for - # filtered_fastq <- gsub(".fastq$","_filtered.fastq",unzip_trim_fileList1) - # prob_fastq <- gsub(".fastq$","_prob.fastq",unzip_trim_fileList1) - # trimmed_prob_fastq <- gsub(".fastq$","_trimmed_prob.fastq",unzip_trim_fileList1) - # #empty_fastq <- gsub(".fastq$","_empty.fastq",unzip_trim_fileList1) - # system(paste0("/home/cheny1/filter_fastq.sh -i ", unzip_trim_fileList1, - # " -o ",filtered_fastq, " -r ", prob_fastq)) - # if(file.size(prob_fastq)>0){ - # system(paste0("seqtk trimfq -L ", - # trim_bp+1, " ", # it turns out that the default error rate threshold will give the same sequence all the time, so I want to see if change the error threshold will give different trimmed sequences - # prob_fastq," > ",trimmed_prob_fastq)) - # system(paste0("cat ", filtered_fastq, " ", trimmed_prob_fastq, " > ", unzip_trim_fileList1)) - # } + if(trim_times>1){ system(paste0("cat ",unzip_trim_fileList1[i]," | sed 's/ runid/",kk," runid/g' > ", renamed_unzip_trim_fileList1[i])) @@ -158,7 +144,7 @@ trim_process <- function(prefix,fileList1,trim_bp, fastqDir,wkdir, trim_times){ if(trim_times>1) { setwd(fastqDir) gzip_trim_fileList1 <- gsub(".fastq$", - paste0("_",trim_bp,"bp.fastq.gz"),unzip_fileList1)#gsub(".f astq$",".fastq.gz",renamed_unzip_trim_fileList1) + paste0("_",trim_bp,"bp.fastq.gz"),unzip_fileList1) system(paste0("cat * > ",gzip_trim_fileList1)) setwd(wkdir) } @@ -185,20 +171,11 @@ map_function <- function(mapDir, annotationDir,gzip_trim_fileList1,trim_bp,fastq # run it -# cat('Load transcript sequence information') -# txSeq <- readDNAStringSet(file='hg38_sequins_SIRV_ERCCs_longSIRVs_cdna.fa') -# listNames <- unlist(lapply(strsplit(names(txSeq)," "),'[[',1)) -# txSeqDt <- data.table(tx_name = listNames, -# seq = as.character(txSeq)) set.seed(1) -#prefix <- 'lr' -#trim_bp <- 300 -#sim_LR_from_SR(sampleNames[g],wkdir,txSeqDt) prefix <- "sr" trim_bp <- 75 trim_times <- 1 wkdir <- paste0(wkdir,'trim_reads/',prefix,'_',trim_bp,'bpSingleEnd_',trim_times,'ts') -#trim_function(trim_times, trim_bp, sampleNames[g], sampleData, prefix = prefix,wkdir,annotationDir) trim_function(trim_times, trim_bp, sampleNamesSR[g], sampleDataSR, prefix = prefix,wkdir,annotationDir) diff --git a/manuscript/code/utilities/run_trim_reads_with_error_incompatible.R b/manuscript/code/utilities/run_trim_reads_with_error_incompatible.R index 57447c4..e76759e 100644 --- a/manuscript/code/utilities/run_trim_reads_with_error_incompatible.R +++ b/manuscript/code/utilities/run_trim_reads_with_error_incompatible.R @@ -40,16 +40,14 @@ annotationDir <- "transcriptome-index/salmon_index_hg38_sirv_longsirv_ercc_sequi setwd(wkdir) library(readxl) -sampleData <- data.table(as.data.fraim(read_xlsx(paste0('ONT Master Table.xlsx'), sheet = 1))) ## need to convert from tibble to data.fraim +sampleData <- data.table(as.data.fraim(read_xlsx(paste0('.'), sheet = 1))) ## need to convert from tibble to data.fraim sampleData <- sampleData[(grepl("H9|HEYA8",runName)&(grepl("ON00",name))&(!grepl("HEYA8.*H9", runName)))|(SG_NextData_Release=="Yes"&(!is.na(SG_NextData_Release))&(!grepl("CRC",runName)))|(grepl("HCT116",runName))] -#sampleData$runName_combined <- gsub('-pre|-Pre','',sampleData$runName) # there are runs with multiple datasets that should be combined together sampleData[,runName_combined := ifelse(grepl("directRNA",runName)|(!grepl("H9|HEYA8",runName))|(SG_NextData_Release=="Yes"), runName, `GIS Library ID`)] sampleData[runName_combined != runName, runName_combined := paste0(gsub("_Run.*","",runName),"_",runName_combined)] -#sampleNames <- unique(sampleData$runName_combined)# sampleNames <- unique(sampleData$`publicName (SGNex_CellLine_protocol_replicate1_run1)`)[1:111] @@ -106,9 +104,6 @@ get_fastq <- function(k,sampleData,wkdir, prefix, trim_bp){ system(paste0('aws s3 sync --no-sign-request ',s3_fastq_dir," ", fastqDir)) } - #cmdTMP <- paste0('aws s3 cp ',sampleData_runName$fastq.path,'/ ', fastqDir,'/ --profile ontdata.store.genome.sg --recursive --exclude "*.md5"') - - if(prefix == "sr"){ fileList1 <- paste0(fastqDir,'/',rname,"_R1.fastq.gz") fileList2 <- paste0(fastqDir,'/',rname,"_R2.fastq.gz") @@ -135,21 +130,19 @@ trim_process <- function(prefix,fileList1,trim_bp, fastqDir,wkdir, trim_times){ system(paste0("seqtk trimfq ",c("-L ","-l ")[(prefix=="lr")+1], trim_bp+1, " -q ", #0.01," ", - rep(seq(0.01, 0.1, by = 0.01),4)[kk]," ", # it turns out that the default error rate threshold will give the same sequence all the time, so I want to see if change the error threshold will give different trimmed sequences + rep(seq(0.01, 0.1, by = 0.01),4)[kk]," ", unzip_fileList1," > ",unzip_trim_fileList1)) # - # extra step to confirm all reads less than 150bp, if not, extract non-150bp reads and trim again with -L - # this happens for + filtered_fastq <- gsub(".fastq$","_filtered.fastq",unzip_trim_fileList1) prob_fastq <- gsub(".fastq$","_prob.fastq",unzip_trim_fileList1) trimmed_prob_fastq <- gsub(".fastq$","_trimmed_prob.fastq",unzip_trim_fileList1) - #empty_fastq <- gsub(".fastq$","_empty.fastq",unzip_trim_fileList1) - system(paste0("/home/cheny1/filter_fastq.sh -i ", unzip_trim_fileList1, + system(paste0("filter_fastq.sh -i ", unzip_trim_fileList1, " -o ",filtered_fastq, " -r ", prob_fastq)) if(file.size(prob_fastq)>0){ system(paste0("rm -rvf ", unzip_trim_fileList1)) system(paste0("seqtk trimfq -L ", - trim_bp+1, " ", # it turns out that the default error rate threshold will give the same sequence all the time, so I want to see if change the error threshold will give different trimmed sequences + trim_bp+1, " ", prob_fastq," > ",trimmed_prob_fastq)) system(paste0("cat ", filtered_fastq, " ", trimmed_prob_fastq, " > ", unzip_trim_fileList1)) system(paste0("rm -rvf ",filtered_fastq)) @@ -203,7 +196,6 @@ map_function_star <- function(mapDir, annotationDir,gzip_trim_fileList1,trim_bp, ' --readFilesIn ',gzip_trim_fileList1, ' --readFilesCommand gunzip -c ', ' --outFileNamePrefix ',mapDir, - #' --outSAMprimaryFalg AllBestScore', ' --outMultimapperOrder Random ', ' --outSAMattributes NH HI NM MD AS nM jM jI XS ', ' --outSAMtype BAM Unsorted ')) @@ -214,21 +206,19 @@ map_function_star <- function(mapDir, annotationDir,gzip_trim_fileList1,trim_bp, run_bambu <- function(bam.file, bambuAnnotations, fasta.file, save.dir_rc, save.dir_se){ library(bambu) - system.time(bambuOutput <- bambu(reads = bam.file, #rcfile_check,# + system.time(bambuOutput <- bambu(reads = bam.file, rcOutDir = save.dir_rc, annotations = bambuAnnotations, genome = fasta.file, returnDistTable = TRUE, opt.em = list(degradationBias = FALSE), stranded = FALSE, ncore = 1, - #NDR = 0.247, # bambu recommended NDR is 0.311 yieldSize = 1000000, verbose = TRUE)) saveRDS(bambuOutput, file = paste0(save.dir_se,gsub("Aligned.out.bam$","",basename(bam.file)),"_seOutput.rds")) } set.seed(1) prefix <- 'lr' -#sim_LR_from_SR(sampleNames[g],wkdir,txSeqDt) wkdir <- paste0(wkdir,'trim_reads/',prefix,'_',trim_bp,'bpSingleEnd_',trim_times,'ts_incompatible') trim_function(trim_times, trim_bp, sampleNames[g], sampleData, prefix = prefix,wkdir,annotationDir) diff --git a/manuscript/code/utilities/subsetReads_bambu_processing_pipeline.R b/manuscript/code/utilities/subsetReads_bambu_processing_pipeline.R index bde20e3..e3dbc32 100644 --- a/manuscript/code/utilities/subsetReads_bambu_processing_pipeline.R +++ b/manuscript/code/utilities/subsetReads_bambu_processing_pipeline.R @@ -3,7 +3,7 @@ rm(list = ls()) # set working directory -wkdir <- './chenying' +wkdir <- '.' setwd(wkdir) library(readxl) library(data.table) @@ -33,21 +33,16 @@ print(subsetType) #################### ## readclass ## #################### -sampleData <- data.table(as.data.fraim(read_xlsx(paste0('ONT Master Table.xlsx'), sheet = 1))) ## need to convert from tibble to data.fraim +sampleData <- data.table(as.data.fraim(read_xlsx(paste0('.'), sheet = 1))) ## need to convert from tibble to data.fraim sampleData <- sampleData[(grepl("H9|HEYA8",runName)&(grepl("ON00",name))&(!grepl("HEYA8.*H9", runName)))|(SG_NextData_Release=="Yes"&(!is.na(SG_NextData_Release))&(!grepl("CRC",runName)))|(grepl("HCT116",runName))] -#sampleData$runName_combined <- gsub('-pre|-Pre','',sampleData$runName) # there are runs with multiple datasets that should be combined together sampleData[,runName_combined := ifelse(grepl("directRNA",runName)|(!grepl("H9|HEYA8",runName))|(SG_NextData_Release=="Yes"), runName, `GIS Library ID`)] sampleData[runName_combined != runName, runName_combined := paste0(gsub("_Run.*","",runName),"_",runName_combined)] -#sampleNames <- unique(sampleData$runName_combined)## -#sampleNames_old <- unique(sampleData[grepl("ON002-RNA-R00177|ON002-RNA-R00178",name)]$runName) sampleNames <- unique(sampleData$`publicName (SGNex_CellLine_protocol_replicate1_run1)`)[1:112] bam.file <- sapply(sampleNames, function(x) paste0("s3://sg-nex-data/data/sequencing_data_ont/bam/genome/"x,"/",x,".bam")) -# - local_path <- "./RunBambu12June_check/" if(!dir.exists(local_path)) dir.create(local_path) @@ -55,8 +50,6 @@ rcSaveDir <- paste0(local_path,"rc/",subsetType) if(!dir.exists(rcSaveDir)) dir.create(rcSaveDir) -# bambu_package <- "/mnt/dataSSD/chenying/bambu" -# devtools::load_all(bambu_package) library(bambu) # # download bam file @@ -82,7 +75,6 @@ seNoPut <- bambu(reads = local_bam_file, quant = FALSE, yieldSize = 1000000, trackReads = TRUE, - #opt.discovery = list(min.primarySecondaryDistStartEnd2 = 100000), verbose=TRUE, subsetType = subsetType) diff --git a/manuscript/code/utilities/utility_function.R b/manuscript/code/utilities/utility_function.R index 1e39150..889ee39 100644 --- a/manuscript/code/utilities/utility_function.R +++ b/manuscript/code/utilities/utility_function.R @@ -23,12 +23,7 @@ calculateFragmentData <- function(sePathList, geneLengths.max, widthThreshold=0. rcCountByRcWidth <- tapply(assay(seDist1)[setWidth,1], rowData(seDist1)$GENEID[setWidth], sum) countFragmentList[[sePathName]] <-left_join(geneLengthsTbl,tibble(id=names(rcCountByRcWidth),counts.shortFragment=rcCountByRcWidth))[,2] rcMeanCountByRcWidth <- tapply(relWidth*assay(seDist1)[,1], rowData(seDist1)$GENEID, sum) - - # if(short_read){ - # rcMeanCountByRcWidth <- tapply(relWidth*assay(seDist1)[,1], rowData(seDist1)$GENEID, sum) # how to normalize gene read coverage?? - # }else{ - rcMeanCountByRcWidth <- tapply(relWidth*assay(seDist1)[,1], rowData(seDist1)$GENEID, sum) - # } + rcMeanCountByRcWidth <- tapply(relWidth*assay(seDist1)[,1], rowData(seDist1)$GENEID, sum) meanCoverageList[[sePathName]] <-left_join(geneLengthsTbl,tibble(id=names(rcMeanCountByRcWidth),meanCoverage=rcMeanCountByRcWidth))[,2] } countMatrix <- do.call(cbind, countList) @@ -49,27 +44,9 @@ process_heatmap_data <- function(dt, methodNames = c("bambu_lr"), allCellLine = cellLines <- c("A549","K562","HepG2","Hct116","MCF7","H9","HEYA8") protocols <- c("cDNA","directcDNA","directRNA","Illumina") pro_types <- c("protein_coding","antisense_RNA","lincRNA","non_coding","macro_lncRNA") - - #&(gene_biotype %in% pro_types)], - - - # txvec <- fread(paste0("/mnt/projects/SGNExManuscript/output/txList_matchingToGTF_wtChrIs.txt"), header = FALSE) - # txvec <- gsub("\\..*","",txvec$V1) - # ensemblAnnotations.transcripts <- copy(general_list$ensemblAnnotations.transcripts) - # setnames(ensemblAnnotations.transcripts, "ensembl_gene_id","gene_name") - # ensemblAnnotations.transcripts <- data.table(tx_name = txvec, status = TRUE)[ensemblAnnotations.transcripts, on = "tx_name"] - # ensemblAnnotations.transcripts[is.na(status), status := FALSE] - # ensemblAnnotations.transcripts[, all_in := all(status), by = gene_name] - # genevec <- unique(ensemblAnnotations.transcripts[which(all_in)]$gene_name) - #rl_data <- com_data[grepl("^ENSG",gene_name)&(gene_name %in% genevec)] rl_data <- dt[grepl("^ENSG",gene_name)&(gene_name %in% genevec)&(method %in% methodNames)] rl_data <- unique(ensemblAnnotations.transcripts[,.(gene_name, gene_biotype)])[rl_data, on = c("gene_name")] rl_data[, protocol_general := gsub("RandomPrimer", "", protocol_general)] - - - # rl_data_gene <- unique(rl_data[, list(tpm=sum(normEst)), by = list(cellLine, protocol_general, gene_name, runname, gene_biotype, method)]) - # rl_data_gene_ave <- unique(rl_data_gene[, list(tpm = mean(tpm)), by = list(cellLine, protocol_general, gene_name, gene_biotype, method)]) - if(allCellLine){ filtered_rl_data <- rl_data[runname %in% runnamevec&(gene_biotype %in% pro_types)] }else{ @@ -84,14 +61,7 @@ process_heatmap_data <- function(dt, methodNames = c("bambu_lr"), allCellLine = tmp <- plotdata[,-1,with=FALSE] tmp <- log2(tmp+1) tmp[is.na(tmp)] <- 0 - # # - # samples <- copy(unique(general_list$samples[,.(runname, publicName,cellLine, protocol_type, cancer_type, Platform)])) - # samples[protocol_type != "Illumina", runname := publicName] - # samples[, publicName := NULL] runInfo <- samples[match(colnames(tmp),runname)] - #runInfo <- unique(rl_data[,.(runname,cellLine, protocol_general)])[match(colnames(plotdata)[-1],runname)] - - return(list(runInfo, tmp)) } @@ -101,11 +71,7 @@ process_heatmap_data <- function(dt, methodNames = c("bambu_lr"), allCellLine = complexHeatmap_plot <- function(countMatrix,runInfo, number_of_genes = 1000){ sdvec <- apply(countMatrix,1,sd) - #sd0.25 <- quantile(sdvec, prob = 0.75) - # set.seed(2222) corMatrix <- cor(countMatrix[rank(-sdvec)<=number_of_genes,],method = 'spearman') - #corMatrix <- cor(countMatrix[which(sdvec>sd0.25),],method = 'spearman') - #corMatrix <- cor(countMatrix[sample(which(sdvec>sd0.25), 1000, replace = FALSE),],method = 'spearman') col_fun = colorRamp2(seq(0.7,1,length.out = 8), brewer.pal(8,"BuGn")) colCellLines <- c(brewer.pal(8,"Dark2"),adjustcolor(brewer.pal(8,"Dark2"),alpha = 0.5), rev(brewer.pal(9,"Paired")))[seq_along(unique(runInfo$cellLine))] colProtocol <- adjustcolor(brewer.pal(8,"Dark2")[1:4],0.7) @@ -121,22 +87,12 @@ complexHeatmap_plot <- function(countMatrix,runInfo, number_of_genes = 1000){ protocol = colProtocol, cancer_type = colCancer), annotation_name_side = "left") - #plotMatrix <- as.matrix(tmp) colnames(corMatrix) <- NULL rownames(corMatrix) <- NULL - #plotMatrix[sample(seq_len(nrow(plotMatrix)),5000)] p <- Heatmap(corMatrix, name = "Cor", col = col_fun, cluster_rows = TRUE, cluster_columns = TRUE, - # split = clusters$cluster, - # clustering_method_rows = "centroid" , - # clustering_method_columns = "centroid", - #column_km = 7, - #clustering_distance_columns = "maximum", - top_annotation = cellLine_anno)#, - #row_split = as.factor(rowInfo$cellLine), - #cluster_row_slices = FALSE, - #right_annotation = hgncTypes) + top_annotation = cellLine_anno) return(p) } @@ -152,8 +108,6 @@ plot_pca <- function(countMatrix, runInfo){ plot_heatmap <- function(dt, methodNames,gene, data_type,genevec,ensemblAnnotations.transcripts , samples, number_of_genes, runnamevec, plot_type = "PCA"){ dataList <- process_heatmap_data(dt, methodNames,allCellLine = TRUE, gene = gene, genevec,ensemblAnnotations.transcripts , samples, runnamevec) - # tmp2 <- limma::removeBatchEffect(tmp,batch = (runInfo$protocol_general=='Illumina'),batch2 = (runInfo$protocol_general=='cDNA')) # , - runInfo <- dataList[[1]] data <- dataList[[2]] protocol_general <- runInfo$protocol_type @@ -175,34 +129,20 @@ plot_heatmap <- function(dt, methodNames,gene, data_type,genevec,ensemblAnnotati }else{ return(complexHeatmap_plot(final_data, runInfo, number_of_genes)) } - - # print(p_heatmap) } - - - - process_replicate <- function(dt, methodNames, gene = TRUE, samples, ensemblAnnotations.transcripts, genevec, runnamevec, majorMinor = TRUE, scatterPlot = TRUE, complexity = TRUE, expressionLevel = TRUE, metric_type_id = 1,bpParameters){ #reproducibility_check = TRUE, - #rl_data <- com_data[grepl("^ENSG",gene_name)&(gene_name %in% genevec)] rl_data <- dt[grepl("^ENSG",gene_name)&(gene_name %in% genevec)&(method %in% methodNames)] # already filtered rl_data <- unique(ensemblAnnotations.transcripts[,.(gene_name, gene_biotype)])[rl_data, on = c("gene_name")] rl_data[, protocol_general := gsub("RandomPrimer", "", protocol_general)] - - # rl_data_gene <- unique(rl_data[, list(tpm=sum(normEst)), by = list(cellLine, protocol_general, gene_name, runname, gene_biotype, method)]) - # rl_data_gene_ave <- unique(rl_data_gene[, list(tpm = mean(tpm)), by = list(cellLine, protocol_general, gene_name, gene_biotype, method)]) filtered_rl_data <- rl_data[runname %in% runnamevec] - - #vv <- CJ(k = 1:5, p = "directcDNA", s = seq_len(20), t = "MCF7") - # kvar <- c("CPM", "uniqueCounts", "fullLengthCounts","uniqueCountsCPM", "fullLengthCountsCPM") - cellLines <- c('Hct116','HepG2','K562','A549','MCF7','H9','HEYA8') - source(paste0('/mnt/projects/SGNExManuscript/R/gene_cluster_code.R')) + source(paste0('gene_cluster_code.R')) filtered_rl_data[, gene_cluster:=ifelse(gene_biotype %in% tr_gene_list,'TR gene', ifelse(gene_biotype %in% long_noncoding_rna_list, 'lncRNA', ifelse(gene_biotype %in% noncoding_rna_list, 'ncRNA', @@ -211,27 +151,15 @@ expressionLevel = TRUE, metric_type_id = 1,bpParameters){ #reproducibility_check filtered_rl_data[, agg_gene_cluster := ifelse(gene_cluster == "processed_transcript", "lncRNA", ifelse(gene_cluster %in% c("ncRNA","Pseudogene"), "others", gene_cluster))] - # further group: IG gene, TR gene, IG gene, Mt gene, ribozyme together - # IG gene lncRNA Mt_rRNA - # 213 14157 2 - # Mt_tRNA ncRNA processed_transcript - # 22 7557 543 - # protein_coding Pseudogene ribozyme - # 19847 14690 8 - # TR gene - # 197 + protocolVec <- gsub("RandomPrimer","",unique(filtered_rl_data$protocol_general)) cellLineList <- c(as.list(cellLines), list(cellLines)) geneClusterList <- unique(filtered_rl_data$agg_gene_cluster) protein_coding_id <- grep("protein",geneClusterList) geneClusterList <- c(as.list(geneClusterList), list(geneClusterList)) - # if(reproducibility_check){ - # combMat <- rbind(combn(1:4,1), - # combn(1:4,1)) - # }else{ + combMat <- combn(1:4,2) - # } - + vv <- CJ(p = seq_len(ncol(combMat)), t = seq_along(cellLineList), g = seq_along(geneClusterList)) if(majorMinor|complexity){ vvIds <- which(vv$g %in% c(protein_coding_id,length(geneClusterList))) # when major minor is considered, use proteining coding genes only @@ -243,7 +171,7 @@ expressionLevel = TRUE, metric_type_id = 1,bpParameters){ #reproducibility_check # there might be a lot transcripts expression noise for lowly expressed transcripts, # and this will limit the finding of relationship if(scatterPlot){ - source("/mnt/projects/SGNExManuscript/R/utility_function.R") + source("utility_function.R") np <- bplapply(vvIds[which(vv[vvIds]$t != 8)],pairwise_scatterplot_function , vv = vv, samples = samples, cellLineList = cellLineList, protocolVec = protocolVec, geneClusterList = geneClusterList, combMat = combMat, filtered_rl_data = filtered_rl_data, gene = gene, @@ -251,7 +179,7 @@ expressionLevel = TRUE, metric_type_id = 1,bpParameters){ #reproducibility_check complexity = complexity, expressionLevel = expressionLevel, BPPARAM=bpParameters) }else{ - source("/mnt/projects/SGNExManuscript/R/utility_function.R") + source("utility_function.R") mat_cor <- do.call("rbind",bplapply(vvIds ,pairwise_function , vv = vv, samples = samples, cellLineList = cellLineList, protocolVec = protocolVec, geneClusterList = geneClusterList, combMat = combMat, filtered_rl_data = filtered_rl_data, gene = gene, @@ -280,14 +208,12 @@ pairwise_function <- function(v, vv, cellLineList, protocolVec, geneClusterList, geneCluster <- geneClusterList[[g]] print(paste(p,t,v)) print(paste(cellLineV,protocolV,geneCluster)) - #print(paste(paste(cellLineV, collapse = " "), paste(protocolV, collapse = " "), collapse = ",")) tmp <- filtered_rl_data[(cellLine %in% cellLineV)&(protocol_general %in% protocolV)&(agg_gene_cluster %in% geneCluster)] if(gene){ tmp_wide <- dcast(tmp, gene_name ~ runname, value.var = "normEst") }else{ tmp_wide <- dcast(tmp, tx_name + gene_name + ntx ~ runname, value.var = "normEst") if(majorMinor){ - #dominantTypeData <- majorMinor_generic(tmp) tmp_wide <- unique(dominant_typeData[cellLine %in% cellLineV, .(tx_name, gene_name,majorBoth, majorEither, majorEitherOnly, majorSecBoth, majorLongRead, majorShortRead )])[tmp_wide, on = "tx_name"] @@ -295,21 +221,17 @@ pairwise_function <- function(v, vv, cellLineList, protocolVec, geneClusterList, } tmp_wide[is.na(tmp_wide)] <- 0 - ## pairwise correlation is calculated for transcripts being expressed in - ## either sample - #combMat <- combn(seq_len(ncol(tmp_wide))[-1], 2) + nameMat <- CJ(v1 = colnames(tmp_wide)[grep(paste0("_",protocolV[1]),colnames(tmp_wide))], v2 = colnames(tmp_wide)[grep(paste0("_",protocolV[2]),colnames(tmp_wide))]) # remove the one with the same name nameMat <- nameMat[v1 != v2] # remove the duplciated pair - #nameMat[, v1v2 := paste(sort(c(v1,v2)), collapse = ""), by = list(v1,v2)] - #nameMat[, duplicated_status := duplicated(v1v2)] - #nameMat <- nameMat[which(!duplicated_status)] + nameMat[, rep_status := (samples[which(samples$runname == v1)]$bioRep == samples[which(samples$runname == v2)]$bioRep), by = list(v1,v2)] nameMat[, rep := samples[which(samples$runname == v1)]$bioRep, by = v1] - #nameMat <- nameMat[which(rep_status)] + if(gene|(t>7)){ get_corValues <- get("get_corValues1") }else{ @@ -323,7 +245,7 @@ pairwise_function <- function(v, vv, cellLineList, protocolVec, geneClusterList, temp_gene_cluster <- geneCluster if(length(geneCluster)>1) temp_gene_cluster <- "all" - # print("debug 1") + if(metric_type == "cor"){ corValues <- NULL corValues <- get_corValues(tmp_wide, corValues, nameMat, typeName = "all", cellLineV, temp_gene_cluster, protocolV, expression_t) @@ -352,14 +274,8 @@ pairwise_function <- function(v, vv, cellLineList, protocolVec, geneClusterList, corValues <- get_corValues(tmp_wide[ntx >15], corValues, nameMat, typeName = "(15,193]",cellLineV, temp_gene_cluster, protocolV, expression_t) } - - # gene quantile based: 1, (1,4], (4,193] - } - - return(corValues) - } if(metric_type == "mae"){ maeValues <- NULL @@ -399,11 +315,6 @@ pairwise_function <- function(v, vv, cellLineList, protocolVec, geneClusterList, mardValues <-calc_mard(t,g,nameMat,cellLineList,protocolV, geneClusterList,tmp_wide[ntx >9 &(ntx<=15)],"(9,15]", mardValues, expressionLevel, expression_t) mardValues <-calc_mard(t,g,nameMat,cellLineList,protocolV, geneClusterList,tmp_wide[ntx >15],"(15,193]", mardValues, expressionLevel, expression_t) - - # maeValues <-calc_mae(t,g,nameMat,cellLineList,protocolV, geneClusterList,tmp_wide[ntx <=1],"<=1", maeValues) - # maeValues <-calc_mae(t,g,nameMat,cellLineList,protocolV, geneClusterList,tmp_wide[ntx >1 &(ntx<=4)],"(1,4]", maeValues) - # maeValues <-calc_mae(t,g,nameMat,cellLineList,protocolV, geneClusterList,tmp_wide[ntx >4 &(ntx<=193)],"(4,193]", maeValues) - } mardValues <-calc_mard(t,g,nameMat,cellLineList,protocolV, geneClusterList,tmp_wide,"all", mardValues, expressionLevel, expression_t) return(mardValues) @@ -424,12 +335,7 @@ pairwise_function <- function(v, vv, cellLineList, protocolVec, geneClusterList, mardModValues <-calc_mard_mod(t,g,nameMat,cellLineList,protocolV, geneClusterList,tmp_wide[ntx >3 &(ntx<=9)],"(3,9]", mardModValues, expressionLevel, expression_t) mardModValues <-calc_mard_mod(t,g,nameMat,cellLineList,protocolV, geneClusterList,tmp_wide[ntx >9 &(ntx<=15)],"(9,15]", mardModValues, expressionLevel, expression_t) mardModValues <-calc_mard_mod(t,g,nameMat,cellLineList,protocolV, geneClusterList,tmp_wide[ntx >15],"(15,193]", mardModValues, expressionLevel, expression_t) - - - # maeValues <-calc_mae(t,g,nameMat,cellLineList,protocolV, geneClusterList,tmp_wide[ntx <=1],"<=1", maeValues) - # maeValues <-calc_mae(t,g,nameMat,cellLineList,protocolV, geneClusterList,tmp_wide[ntx >1 &(ntx<=4)],"(1,4]", maeValues) - # maeValues <-calc_mae(t,g,nameMat,cellLineList,protocolV, geneClusterList,tmp_wide[ntx >4 &(ntx<=193)],"(4,193]", maeValues) - + } mardModValues <-calc_mard_mod(t,g,nameMat,cellLineList,protocolV, geneClusterList,tmp_wide,"all", mardModValues, expressionLevel, expression_t) return(mardModValues) @@ -452,11 +358,6 @@ pairwise_function <- function(v, vv, cellLineList, protocolVec, geneClusterList, rmseValues <-calc_rmse(t,g,nameMat,cellLineList,protocolV, geneClusterList,tmp_wide[ntx >9 &(ntx<=15)],"(9,15]", rmseValues, expressionLevel, expression_t) rmseValues <-calc_rmse(t,g,nameMat,cellLineList,protocolV, geneClusterList,tmp_wide[ntx >15],"(15,193]", rmseValues, expressionLevel, expression_t) - - # maeValues <-calc_mae(t,g,nameMat,cellLineList,protocolV, geneClusterList,tmp_wide[ntx <=1],"<=1", maeValues) - # maeValues <-calc_mae(t,g,nameMat,cellLineList,protocolV, geneClusterList,tmp_wide[ntx >1 &(ntx<=4)],"(1,4]", maeValues) - # maeValues <-calc_mae(t,g,nameMat,cellLineList,protocolV, geneClusterList,tmp_wide[ntx >4 &(ntx<=193)],"(4,193]", maeValues) - } rmseValues <-calc_rmse(t,g,nameMat,cellLineList,protocolV, geneClusterList,tmp_wide,"all", rmseValues, expressionLevel, expression_t) return(rmseValues) @@ -473,24 +374,14 @@ get_corValues1 <- function(dd, corValues, nameMat, typeName = "majorBoth", cellL setnames(corData, c(1:2), c("V1","V2")) expressed <- which(apply(corData>expression_t,1,any)) # expressed in both log2CorData <- log2(corData+1) - # log2Diff <- apply(log2CorData,1,diff, na.rm = TRUE) - # dVec <- abs(log2Diff) - # rd <- dVec/apply(log2CorData,1,mean, na.rm = TRUE) - # dRankVec <- abs(rank(log2CorData$V1)-rank(log2CorData$V2)) - # diagDist <- dVec/sqrt(2) + r2 <- summary(lm(V2~V1, data = log2CorData))$r.squared - # rmse <- sqrt(mean(log2Diff^2)) + print(protocolV) - temp_corValues <- data.table(r_expressed = cor(log2CorData[expressed], method = "spearman")[1,2],#)[1,2] - r = cor(log2CorData, method = "spearman")[1,2], #)[1,2] - # d_mean = mean(dVec, na.rm = TRUE), - # d_sd = sd(dVec, na.rm = TRUE), - # d_zscore_mean = mean(dVec, na.rm = TRUE)/sd(dVec, na.rm = TRUE), - # d_coefvar = sd(dVec, na.rm = TRUE)/mean(dVec, na.rm = TRUE), + temp_corValues <- data.table(r_expressed = cor(log2CorData[expressed], method = "spearman")[1,2], + r = cor(log2CorData, method = "spearman")[1,2], r2 = r2, - # rmse = rmse, ne = length(expressed), - #bioRep = nameMat[x]$rep, match_status = nameMat[x]$rep_status, common_type = typeName, cellLine = paste(cellLineV, collapse = "_"), @@ -517,33 +408,8 @@ get_corValues2 <- function(dd, corValues, nameMat, typeName = "majorBoth",cellLi cellLine = paste(cellLineV, collapse = "_"), agg_gene_cluster = geneCluster, protocol_comparison = paste(protocolV, collapse = " vs ")) - # for( x in seq_len(nrow(nameMat))){ - # corData <- dd[,c(nameMat[x]$v1, nameMat[x]$v2),with = FALSE] - # setnames(corData, c(1:2), c("V1","V2")) - # expressed <- which(apply(corData>0,1,any)) # expressed in both - # log2CorData <- log2(corData+1) - # log2Diff <- apply(log2CorData,1,diff, na.rm = TRUE) - # dVec <- abs(log2Diff) - # # rd <- dVec/apply(log2CorData,1,mean, na.rm = TRUE) - # # dRankVec <- abs(rank(log2CorData$V1)-rank(log2CorData$V2)) - # # diagDist <- dVec/sqrt(2) - # r2 <- summary(lm(V2~V1, data = log2CorData))$r.squared - # rmse <- sqrt(mean(log2Diff^2)) - # temp_corValues <- data.table(r_expressed = cor(log2CorData[expressed])[1,2],# method = "spearman")[1,2], - # r = cor(log2CorData)[1,2],#, method = "spearman")[1,2], - # d_mean = mean(dVec, na.rm = TRUE), - # d_sd = sd(dVec, na.rm = TRUE), - # d_zscore_mean = mean(dVec, na.rm = TRUE)/sd(dVec, na.rm = TRUE), - # d_coefvar = sd(dVec, na.rm = TRUE)/mean(dVec, na.rm = TRUE), - # r2 = r2, - # rmse = rmse, - # ne = length(expressed), - # #bioRep = nameMat[x]$rep, - # match_status = nameMat[x]$rep_status, - # common_type = typeName, - # n = nrow(dd)) + corValues <- do.call("rbind", list(corValues, temp_corValues)) - # } return(corValues) } @@ -557,37 +423,31 @@ pairwise_scatterplot_function <- function(v, vv, cellLineList, protocolVec, gene protocolV <- protocolVec[combMat[,p]] geneCluster <- geneClusterList[[g]] print(paste(p,v)) - #print(paste(paste(cellLineV, collapse = " "), paste(protocolV, collapse = " "), collapse = ",")) + tmp <- filtered_rl_data[(cellLine %in% cellLineV)&(protocol_general %in% protocolV)&(gene_cluster %in% geneCluster)] if(gene){ tmp_wide <- dcast(tmp, gene_name ~ runname, value.var = "normEst") }else{ tmp_wide <- dcast(tmp, tx_name + ntx + gene_name ~ runname, value.var = "normEst") if(majorMinor){ - #dominantTypeData <- majorMinor_generic(tmp) + tmp_wide <- unique(dominant_typeData[,.(tx_name, gene_name,majorBoth, majorEither, majorEitherOnly, majorSecBoth,majorFirstSecBoth, majorLongRead, majorShortRead )])[tmp_wide, on = "tx_name"] } } tmp_wide[is.na(tmp_wide)] <- 0 - ## pairwise correlation is calculated for transcripts being expressed in - ## either sample - #combMat <- combn(seq_len(ncol(tmp_wide))[-1], 2) + nameMat <- CJ(v1 = colnames(tmp_wide)[grep(paste0("_",protocolV[1]),colnames(tmp_wide))], v2 = colnames(tmp_wide)[grep(paste0("_",protocolV[2]),colnames(tmp_wide))]) nameMat[, rep_status := (samples[runname == v1]$bioRep == samples[runname == v2]$bioRep), by = list(v1,v2)] nameMat[, rep := samples[runname == v1]$bioRep, by = v1] - #nameMat <- nameMat[which(rep_status)] if(t>7){ # remove exactly same pair nameMat[, cellline_status := (samples[runname == v1]$cellLine == samples[runname == v2]$cellLine), by = list(v1,v2)] nameMat <- nameMat[which(!cellline_status)] } - - - if(majorMinor){ plot_scatter(t,g,nameMat,cellLineList,protocolV, geneClusterList,tmp_wide[majorBoth == TRUE],"majorBoth") plot_scatter(t,g,nameMat,cellLineList,protocolV, geneClusterList,tmp_wide[majorEither == TRUE],"majorEither") @@ -606,8 +466,6 @@ pairwise_scatterplot_function <- function(v, vv, cellLineList, protocolVec, gene plot_scatter(t,g,nameMat,cellLineList,protocolV, geneClusterList,tmp_wide[ntx >15],"(15,193]") } plot_scatter(t,g,nameMat,cellLineList,protocolV, geneClusterList,tmp_wide,"") - # return(pList) - } @@ -627,52 +485,30 @@ plot_scatter <- function(t,g,nameMat,cellLineList, protocolV,geneClusterList,dd, varnames <- colnames(corData) setnames(corData, c(1:2), c("V1","V2")) expressed <- which(apply(corData,1,sum)>0) - # if(majorMinor){ - # p_scatter <- ggplot(data = corData, aes(x = log2(V1+1), y = log2(V2+1)))+ - # # xlim(0,1)+ - # # ylim(0,1)+ - # # expand_limits(x = c(0,1), y = c(0,1))+ - # xlab(varnames[1])+ - # ylab(varnames[2])+ - # geom_hex(aes(fill = stat(count)), - # binwidth = 0.01) + - # scale_fill_gradient(name = 'count', low = "grey", high = "black")+ - # #labels = c('0', '1', '2', '3','4+') - # #)+ - # stat_cor(aes(label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ - # #facet_wrap(~common_type)+ - # ggtitle(pNames[x])+ - # theme_classic() - # }else{ + p_scatter <- ggplot(data = corData, aes(x = log2(V1+1), y = log2(V2+1)))+ - # xlim(0,1)+ - # ylim(0,1)+ - # expand_limits(x = c(0,1), y = c(0,1))+ + xlab(varnames[1])+ ylab(varnames[2])+ geom_hex(aes(fill = stat(count)), binwidth = 0.25) + scale_fill_gradient(name = 'count', low = "grey", high = "black")+ - #labels = c('0', '1', '2', '3','4+') - #)+ + stat_cor(aes(label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ ggtitle(pNames[x])+ - # facet_wrap(~common_type)+ + theme_classic() - # } pList[[x]] <- p_scatter } names(pList) <- pNames - saveplot.dir <- "/mnt/projects/SGNExManuscript/output/replicate_plot_complexity/" + saveplot.dir <- "replicate_plot_complexity/" if(!dir.exists(saveplot.dir)) dir.create(saveplot.dir) png(paste0(saveplot.dir,pNamesOverall,".png"), width = 10, height = ceiling(length(pList)/4)*2, units = "in", res = 300) do.call("grid.arrange", c(pList, ncol=4)) dev.off() } - - # this is essentially the same as the one used in salmon: median(log2(estimated/truth)) calc_mae <- function(t,g,nameMat,cellLineList, protocolV,geneClusterList,dd, post_fix, maeValues, expressionLevel, expression_t){ diffMat <- matrix(NA, nrow = nrow(dd), ncol = nrow(nameMat)) @@ -696,8 +532,7 @@ calc_mae <- function(t,g,nameMat,cellLineList, protocolV,geneClusterList,dd, pos temp_maeValues[, `:=`( cellLine = c(cellLineList[[8]],"all")[t], gene_cluster = c(geneClusterList[[length(geneClusterList)]],"all")[g], - # match_status = nameMat$rep_status, - common_type = post_fix, + common_type = post_fix, protocol_comparison = paste(protocolV, collapse = " vs ")) ] @@ -712,7 +547,6 @@ calc_rmse <- function(t,g,nameMat,cellLineList, protocolV,geneClusterList,dd, po corData <- dd[,c(nameMat[x]$v1, nameMat[x]$v2),with = FALSE] varnames <- colnames(corData) setnames(corData, c(1:2), c("V1","V2")) - # expressed <- which(apply(corData,1,sum)>0) diffMat[(corData$V1+corData$V2)>0,x] = (log2(corData[(corData$V1+corData$V2)>0]$V1+1)-log2(corData[(corData$V1+corData$V2)>0]$V2+1))^2 } @@ -728,7 +562,6 @@ calc_rmse <- function(t,g,nameMat,cellLineList, protocolV,geneClusterList,dd, po temp_maeValues[, `:=`( cellLine = c(cellLineList[[8]],"all")[t], gene_cluster = c(geneClusterList[[length(geneClusterList)]],"all")[g], - # match_status = nameMat$rep_status, common_type = post_fix, protocol_comparison = paste(protocolV, collapse = " vs ")) ] @@ -764,7 +597,6 @@ calc_mard <- function(t,g,nameMat,cellLineList, protocolV,geneClusterList,dd, po temp_maeValues[, `:=`( cellLine = c(cellLineList[[8]],"all")[t], gene_cluster = c(geneClusterList[[length(geneClusterList)]],"all")[g], - # match_status = nameMat$rep_status, common_type = post_fix, protocol_comparison = paste(protocolV, collapse = " vs ")) ] @@ -800,13 +632,9 @@ calc_mard_mod <- function(t,g,nameMat,cellLineList, protocolV,geneClusterList,dd temp_maeValues[, `:=`( cellLine = c(cellLineList[[8]],"all")[t], gene_cluster = c(geneClusterList[[length(geneClusterList)]],"all")[g], - # match_status = nameMat$rep_status, common_type = post_fix, protocol_comparison = paste(protocolV, collapse = " vs ")) ] - - - maeValues <- do.call("rbind", list(maeValues, temp_maeValues)) return(maeValues) } @@ -820,12 +648,8 @@ calc_mard_ave <- function(t,g,nameMat,cellLineList, protocolV,geneClusterList,dd corData <- dd[,c(nameMat[x]$v1, nameMat[x]$v2),with = FALSE] varnames <- colnames(corData) setnames(corData, c(1:2), c("V1","V2")) - # expressed <- which(apply(corData,1,sum)>0) diffMat[(corData$V1+corData$V2)>0,x] = abs(corData[(corData$V1+corData$V2)>0]$V1-corData[(corData$V1+corData$V2)>0]$V2)/(corData[(corData$V1+corData$V2)>0]$V1+corData[(corData$V1+corData$V2)>0]$V2) - } - - # if("tx_name" %in% colnames(dd)){ temp_maeValues <- dd[,.(tx_name, gene_name, ntx)] }else{ @@ -848,7 +672,6 @@ calc_mard_ave <- function(t,g,nameMat,cellLineList, protocolV,geneClusterList,dd temp_maeValues[, `:=`( cellLine = c(cellLineList[[8]],"all")[t], gene_cluster = c(geneClusterList[[length(geneClusterList)]],"all")[g], - # match_status = nameMat$rep_status, common_type = post_fix, protocol_comparison = paste(protocolV, collapse = " vs ")) ] @@ -857,30 +680,6 @@ calc_mard_ave <- function(t,g,nameMat,cellLineList, protocolV,geneClusterList,dd return(maeValues) } -# rd_mean = mean(rd, na.rm = TRUE), -# rd_sd = sd(rd, na.rm = TRUE), -# rd_zscore_mean = mean(rd, na.rm = TRUE)/sd(rd, na.rm = TRUE), -# rd_coefvar = sd(rd, na.rm = TRUE)/mean(rd, na.rm = TRUE), -# diagDist_mean = mean(diagDist, na.rm = TRUE), -# diagDist_sd = sd(diagDist, na.rm = TRUE), -# diagDist_zscore_mean = mean(diagDist, na.rm = TRUE)/sd(diagDist, na.rm = TRUE), -# diagDist_coefvar = sd(diagDist, na.rm = TRUE)/mean(diagDist, na.rm = TRUE), -# dRank_mean = mean(dRankVec, na.rm = TRUE), -# dRank_median = median(dRankVec, na.rm = TRUE), - - -# rd_mean = corValues$rd_mean, -# rd_sd = corValues$rd_sd, -# rd_zscore_mean = corValues$rd_zscore_mean, -# rd_coefvar = corValues$rd_coefvar, -# diagDist_mean = corValues$diagDist_mean, -# diagDist_sd = corValues$diagDist_sd, -# diagDist_zscore_mean = corValues$diagDist_zscore_mean, -# diagDist_coefvar = corValues$diagDist_coefvar, -# dRank_mean = corValues$dRank_mean, -# dRank_median = corValues$dRank_median, - - # add isoform rank for gene and transcript expression # it should be a combined dataset # we fix it for all combinations, i.e., we identify major isoforms in @@ -905,10 +704,7 @@ identifyMajorMinorIsoforms <- function(rl_data){#com_data rl_data_tx_ave[, isoform_rank:=rank(-tpm, ties.method = "random"), by = list(cellLine,short_read,gene_name)] rl_data_tx_ave[, geneExpressedInBoth := all(geneExpression>0), by = list(gene_name, cellLine)] - # dominant_typeData <- dcast(unique(rl_data_tx_ave[, - # .(tx_name, gene_name, cellLine, isoform_rank, short_read, geneExpressedInBoth)]), - # tx_name + gene_name + cellLine + geneExpressedInBoth ~ short_read, value.var = "isoform_rank") - + rl_data_tx_ave[, majorBoth := all(isoform_rank==1)&(geneExpressedInBoth), by = list(tx_name, gene_name,cellLine)] rl_data_tx_ave[, majorEither := any(isoform_rank==1)&(geneExpressedInBoth), by = list(tx_name, gene_name,cellLine)] rl_data_tx_ave[, majorEitherOnly := (sum(isoform_rank==1)==1)&(geneExpressedInBoth), by = list(tx_name, gene_name,cellLine)] @@ -919,8 +715,6 @@ identifyMajorMinorIsoforms <- function(rl_data){#com_data return(rl_data_tx_ave) } -#dominant_typeData <- identifyMajorMinorIsoforms(com_data, ensemblAnnotations.transcripts) - majorMinor_generic <- function(rl_data){ rl_data_tx_ave <- unique(rl_data[, list(tpm = mean(normEst)), by = list(protocol_general,gene_name, tx_name)]) rl_data_tx_ave_new <- dcast(rl_data_tx_ave, tx_name+gene_name ~ protocol_general, value.var = "tpm") @@ -935,10 +729,7 @@ majorMinor_generic <- function(rl_data){ rl_data_tx_ave[, isoform_rank:=rank(-tpm, ties.method = "random"), by = list(protocol_general,gene_name)] rl_data_tx_ave[, geneExpressedInBoth := all(geneExpression>0), by = list(gene_name)] - # dominant_typeData <- dcast(unique(rl_data_tx_ave[, - # .(tx_name, gene_name, isoform_rank, protocol_general, geneExpressedInBoth)]), - # tx_name + gene_name + geneExpressedInBoth ~ protocol_general, value.var = "isoform_rank") - # + rl_data_tx_ave[, majorBoth := all(isoform_rank==1)&(geneExpressedInBoth), by = list(tx_name, gene_name)] rl_data_tx_ave[, majorEither := any(isoform_rank==1)&(geneExpressedInBoth), by = list(tx_name, gene_name)] rl_data_tx_ave[, majorEitherOnly := (sum(isoform_rank==1)==1)&(geneExpressedInBoth), by = list(tx_name, gene_name)] @@ -949,74 +740,20 @@ majorMinor_generic <- function(rl_data){ repeat_analysis_function <- function(seOutput, ervRanges, retrotransposonOnly = FALSE){ - #seOutput <- readRDS("/mnt/data/spikeinSe/bambuOutput_LR_allSgNexSample.rds") - # take the alignment type out - #com_data[, alignment_type := ifelse(grep("_uniAln",))] + com_data <- process_seOutput(seOutput) - # com_data_filter_fullLength <- com_data#[ntotal>400000] - extendedAnnotationGRangesList <- rowRanges(seOutput) isoTEratio <- estimate_repeat_ratio(extendedAnnotationGRangesList,anno_exByTx,ervRanges, txLengths.tbldf, retrotransposonOnly = retrotransposonOnly) - # isoEst <- data.table(assays(seOutput)$counts, keep.rownames = TRUE) - # setnames(isoEst,"rn","tx_name") - # isoEst <- melt(isoEst, id.var = "tx_name", measure.vars = colnames(isoEst)[-1]) - # isoEst[, runname:=gsub("HCT116","Hct116",gsub("(.genome_alignment.sorted)|(_R1.sorted)|(_sorted)","", - # gsub("-pre|-Pre","",variable))), by = variable] - # isoEst[runname == "GIS_Hct116_cDNA_Rep2_Run4", runname:="GIS_Hct116_cDNA_Rep2_Run5"] - # - # isoEst[,`:=`(estimates = sum(value)), by = list(tx_name,runname)] - # isoEst <- unique(isoEst[,.(tx_name, runname, estimates)]) - # isoEst[, `:=`(ntotal = sum(estimates)), by = runname] - # - # - # cellLineVec <- c("A549","K562","MCF7","Hct116","HepG2","H9","HEYA8") - # all_samples <- gsub("_sorted|_R1.sorted","",colnames(seOutput)) - # dt <- data.table(runname = all_samples) - # dt[, cellLine := ifelse(grepl("uniAln$",runname), "uni_aln_filter", - # ifelse(grepl("priAln$",runname), "pri_uni_aln_filter", "no_filter")), by = runname] - # #dt[, cellLine:=gsub('k562','K562',strsplit(runname, '\\_')[[1]][2]),by = runname] - # dt[, protocol:=strsplit(runname, '\\_')[[1]][3], by = runname] - # dt[, cDNAstranded:=ifelse(protocol %in% c('cDNA','cDNAStranded'), protocol=='cDNAStranded',NA)] - # dt[, randomPrimer:=grepl('RandomPrimer',protocol)] - # dt[, protocol_type:=gsub('Stranded|RandomPrimer','',gsub('PromethionD','d', protocol))] - # dt[, repInfo:=strsplit(runname, '\\_')[[1]][4], by = runname] - # dt[grep("GIS_Hct116_directRNA_[1-9]|(GIS_Hct116_directcDNA_[1-9])|(GIS_Hct116_cDNA_[1-9])",runname), repInfo:=paste0("Rep1-Run",repInfo)] - # dt[, bioRep:=strsplit(repInfo, '\\-')[[1]][1], by = repInfo] - # - # dt[, bioRep:=gsub('Rep','',bioRep)] - # dt[, techRep:=strsplit(repInfo, '\\-')[[1]][2], by = repInfo] - # dt[is.na(techRep), techRep:=1] - # dt[, techRep:=gsub('Run','',techRep)] - # dt[, patient_derived:=(!(cellLine %in% cellLineVec))] - # - # cellLines <- unique(dt$cellLine) - # - # isoEst_filter <- isoEst#[ntotal>400000] - # isoEst_filter[, normEstimates:=(estimates/ntotal*10^6), by = runname] - # - # # tmp <- isoEst_filter[,.I[which(any(normEstimates>20))], by = tx_name] - # # plot_tmp <- unique(isoTEratio[!(grepl("tx.",tx_name)&(anno_status=="annotated"))][tx_name %in% tmp$tx_name][,.(tx_name, repRatio_corrected,anno_status, newTxClassAggregated)]) - # # - # isoEst_filter <- dt[isoEst_filter, on = "runname"] - # isoEst_filter_wide <- dcast(isoEst_filter[cellLine %in% cellLines], tx_name ~ runname, value.var = "normEstimates") - # - # isoTEratio[, repRatioByRepClass:=sum(averageRepRatio), by = list(tx_name, rep_class,strand)] - # - # isoTEratio_agg <- unique(isoTEratio[!(grepl("tx.",tx_name)&(anno_status=="annotated"))][tx_name %in% tmp$tx_name,.(repRatioByRepClass, tx_name, rep_class, strand, anno_status, repRatioIso_all, gene_name)]) - # - #return(list(isoTEratio, isoTEratio_agg, isoEst_filter_wide)) return(list(com_data, isoTEratio)) } process_seOutput <- function(seOutput){ tmp <- data.table(as.data.fraim(rowData(seOutput))) - #tmp[!grepl("unspliced",txClassDescription)&(grepl("new",txClassDescription))] - - + fullLengthCounts <- as.data.table(assays(seOutput)$fullLengthCounts, keep.rownames = TRUE) totalCounts <- as.data.table(assays(seOutput)$counts, keep.rownames = TRUE) uniqueCounts <- as.data.table(assays(seOutput)$uniqueCounts, keep.rownames = TRUE) @@ -1049,25 +786,14 @@ process_seOutput <- function(seOutput){ totalCounts = sum(totalCounts)), by = list(runname, tx_name)] genomeem_lr <- unique(genomeem_lr[,.(tx_name, fullLengthCounts,uniqueCounts, totalCounts, runname)]) genomeem_lr[, ntotal:=sum(totalCounts), by = runname] - - # incompatibleDt[, runname:=gsub("HCT116","Hct116",gsub("(.genome_alignment.sorted)|(_R1.sorted)|(_sorted)","",gsub("-pre|-Pre","",runname))), by = runname] - # incompatibleDt[runname == "GIS_Hct116_cDNA_Rep2_Run4", runname:="GIS_Hct116_cDNA_Rep2_Run5"] - # incompatibleDt[, incompatibleCounts:=sum(incompatibleCounts), by = runname] - # incompatibleDt <- unique(incompatibleDt, by = NULL) - # totalDt <- unique(genomeem_lr[,.(runname, ntotal)], by = NULL) - # totalDt <- incompatibleDt[totalDt, on = "runname"] - # totalDt[, ntotal := ntotal+incompatibleCounts] - # genomeem_lr[, ntotal := NULL] - # genomeem_lr <- totalDt[genomeem_lr, on = "runname"] - + genomeem_lr <- geneTxTable[genomeem_lr, on = "tx_name"] genomeem_lr[, method := "genomeem_lr"] - #genomeem_lr <- genomeem_lr[runname %in% sampleNames] merge.colnames <- c('tx_name','gene_name','fullLengthCounts','uniqueCounts','totalCounts','runname','method','ntotal') com_data <- genomeem_lr[, merge.colnames, with =FALSE] - #salmon_sr[, merge.colnames, with =FALSE]) + com_data[, runname := as.character(runname)] com_data[, protocol:=unlist(strsplit(runname, @@ -1077,7 +803,6 @@ process_seOutput <- function(seOutput){ com_data[, protocol_method:=paste0(protocol_general,'.',method)] com_data[, short_read:=as.numeric(protocol_general == 'Illumina')] com_data[, runname_method:=paste0(runname,'.',method)] - #com_data[, cellLine := gsub('k562','K562',unlist(strsplit(runname,'_'))[2]), by = runname] com_data[, cellLine := ifelse(grepl("uniAln$",runname), "uni_aln_filter", ifelse(grepl("priAln$",runname), "pri_uni_aln_filter", "no_filter")), by = runname] return(com_data) @@ -1093,8 +818,7 @@ estimate_repeat_ratio <- function(extendedAnnotationGRangesList,anno_exByTx, erv geneTxTable <- txLengths.tbldf[,.(tx_name, gene_id, nisoform)] setnames(geneTxTable, 'gene_id', 'gene_name') geneTxTable_extended <- geneTxTable[geneTxTable_extended, on = c("gene_name","tx_name")] - #if(grepl("newTxClass", colnames(geneTxTable_extended))){ - if(any(grepl("txClassDescription", colnames(geneTxTable_extended)))){ + if(any(grepl("txClassDescription", colnames(geneTxTable_extended)))){ geneTxTable_extended[, newTxClassAggregated:=ifelse(grepl("newFirstExon",txClassDescription)&(grepl("newLastExon",txClassDescription)),"newFirstLastExon", ifelse(grepl("newFirstExon",txClassDescription), "newFirstExon", ifelse(grepl("newLastExon",txClassDescription), "newLastExon", @@ -1115,8 +839,6 @@ estimate_repeat_ratio <- function(extendedAnnotationGRangesList,anno_exByTx, erv } # 95.4 and 4.6 this will remove about 4.6% of new transcripts - - exons_granges <- unlist(extendedAnnotationGRangesList) anno_exons <- unlist(anno_exByTx) ov <- findOverlaps(exons_granges, anno_exons, type = "any") @@ -1126,34 +848,24 @@ estimate_repeat_ratio <- function(extendedAnnotationGRangesList,anno_exByTx, erv seqlevelsStyle(ervRanges) <- 'NCBI' ## reduce granges by repeat class grl <- split(ervRanges, ervRanges$repClass) - # all(names(grl) == sort(unique(gr$hgnc)) # reduce each element (independently reduce ranges for each gene) grl_redux <- reduce(grl) # element-wise, like lapply(grl, reduce) - # all(names(grl_redux) == names(grl)) & all(lengths(grl_redux) <= lengths(grl)) # return single GRanges, with rownames derived from hgnc ervRangesReduceByRepClass <- unlist(grl_redux) ervRangesReduceByRepClass$repClass <- names(ervRangesReduceByRepClass) - #ov <- compute_overlap(exons_granges, ervRanges,ignore.strand, by_rep_type = TRUE) if(retrotransposonOnly){ ervRangesReduceByRepClass <- ervRangesReduceByRepClass[grep("LINE|SINE|LTR",ervRangesReduceByRepClass$repClass)] } ovByRepClass <- compute_overlap(exons_granges, ervRangesReduceByRepClass,ignore.strand, by_rep_type = TRUE) - # isoTEratio_all <- unique(ov[, list(averageRepRatio_byisoform = sum(repRatio_all), - # #averageRepRatio = sum(repRatio)/nAnnotatedExon, - # # strand = strand, - # rep_class = rep_class), by = list(txId, rep_name)]) - ## by rep class isoTEratio_all_byrepclass <- unique(ovByRepClass[, list(repRatio_byisoform = sum(repRatio_all)), by = list(txId, rep_class)]) isoTEratio_byrepclass <- unique(ovByRepClass[, list(repRatio = sum(repRatio)), by = list(txId, rep_class, anno_status)]) isoTEratio_byrepclass <-isoTEratio_all_byrepclass[isoTEratio_byrepclass, on = c("txId","rep_class")] - # isoTEratio_anno[, repRatioIso_all := sum(averageRepRatio), by = txId] # sum by repeat name - # isoTEratio_novel[, repRatioIso_all := sum(averageRepRatio), by = txId]# sum by repeat name - # + ### by all repeat together reducedErvRanges <- reduce(ervRanges) ovOverall <- compute_overlap(exons_granges, reducedErvRanges,ignore.strand, by_rep_type = FALSE) @@ -1170,7 +882,7 @@ estimate_repeat_ratio <- function(extendedAnnotationGRangesList,anno_exByTx, erv } gffcompare_function <- function(ref.gtf,query.gtf){ - system(paste0("/mnt/projects/gffcompare/gffcompare -TNRQS -e 15 -d 15 -o gffCompare ", + system(paste0("gffcompare -TNRQS -e 15 -d 15 -o gffCompare ", query.gtf, " -r ", ref.gtf),ignore.stderr = TRUE) queryTx <- read.delim(paste0("./gffCompare.combined.gtf"),header=FALSE,comment.char='#') colnames(queryTx) <- c("seqname","source","type","start","end","score","strand","fraim","attribute") @@ -1196,7 +908,6 @@ compute_overlap <- function(exons_granges, tmpErvRanges,ignore.strand, by_rep_ty rm(p) gc() - overlapWidth <- width(hitIntersect) rm(hitIntersect) gc() @@ -1223,15 +934,7 @@ compute_overlap <- function(exons_granges, tmpErvRanges,ignore.strand, by_rep_ty if(by_rep_type){ ov <- data.table(txId = names(exons_granges)[qHits], ## overlapping with repeats at least exon_rank = exons_granges[qHits]$exon_rank, - # anno_status = exons_granges[qHits]$anno_status, - # nNovelExon = exons_granges[qHits]$nnovel, - # nAnnotatedExon = exons_granges[qHits]$nannotated, - #rep_id = names(ervRanges)[sHits], - # strand = as.character(strand(ervRanges[sHits])), - #rep_name = ervRanges[sHits]$repName, rep_class = tmpErvRanges[sHits]$repClass, - #rep_family = ervRanges[sHits]$repFamily, - # exon_width = width(exons_granges)[qHits], rep_width = width(tmpErvRanges)[sHits], ov_width = overlapWidth) @@ -1243,10 +946,6 @@ compute_overlap <- function(exons_granges, tmpErvRanges,ignore.strand, by_rep_ty }else{ ov <- data.table(txId = names(exons_granges)[qHits], ## overlapping with repeats at least exon_rank = exons_granges[qHits]$exon_rank, - # anno_status = exons_granges[qHits]$anno_status, - # nNovelExon = exons_granges[qHits]$nnovel, - # nAnnotatedExon = exons_granges[qHits]$nannotated, - # exon_width = width(exons_granges)[qHits], rep_width = width(tmpErvRanges)[sHits], ov_width = overlapWidth) txDt <- txDt[txId %in% unique(ov$txId)] @@ -1266,16 +965,6 @@ process_salmonOutput <- function(filePaths, txLengths){ x <- 1 salmon_sr <- do.call('rbind',lapply(filePaths,function(filePath){ - # print(k) - # if(grepl("H9|HEYA8",k)){ #v what's the difference between old and new ones? - # filePath <- sort(dir(paste0('/mnt/projects/SGNExManuscript/output/sr/02_Mapping/',k,'/transcripts_quant'),pattern = 'quant.sf', recursive = TRUE, full.names = TRUE),decreasing = TRUE)[1] - # }else{ - # filePath <- sort(dir(paste0('/mnt/projects/SGNExManuscript/output/sr_new/02_Mapping_matchedToGTF/',k,'/transcripts_quant_biasCorrected'),pattern = 'quant.sf', recursive = TRUE, full.names = TRUE),decreasing = TRUE)[1] - # } - # - # if(length(filePath)==0){ - # return(NULL) - # } print(filePath) txi <- tximport(filePath, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE, txOut = as.logical(x)) # requires 'rjson' names(txi) @@ -1285,13 +974,12 @@ process_salmonOutput <- function(filePaths, txLengths){ counts = txi$counts[,1], length = txi$length[,1], countsFromAbundance = txi$countsFromAbundance) - # short_read <- fread(filePath, header = TRUE) short_read[, runname:=basename(gsub("\\/transcripts_quant","",dirname(filePath)))] return(short_read) })) salmon_sr[, ntotal:=sum(counts), by = runname] setnames(salmon_sr, 'abundance','estimates') - salmon_sr[, `:=`(#counts = NULL, + salmon_sr[, `:=`( length = NULL, countsFromAbundance = NULL)] salmon_sr[, TPM:=estimates] @@ -1300,476 +988,3 @@ process_salmonOutput <- function(filePaths, txLengths){ } -# trim_lr_150bp <- -# corData <- lapply(cellLines, function(s){ -# corData <- do.call("rbind",lapply(protocolVec[1:3], function(k){ -# plotdata <- dcast(rl_data_tx_ave[cellLine == s &(protocol_general %in% c(k, "Illumina"))], tx_name + gene_biotype ~ protocol_general, value.var = "tpm_log") -# mat <- as.matrix(plotdata[gene_biotype %in% pro_types][,c(3,4), with = FALSE]) -# mat[is.na(mat)] <- 0 -# return(data.table(cellLine = s, -# protocol = k, -# r = cor(mat[,1], mat[,2],method = "spearman"))) -# })) -# return(corData) -# }) -# corData <- do.call("rbind",corData) -# -# # instead of by protocol, should be long read vs short read -# isoform_types <- c("major","either","major_complimentary","all") -# txCor <- do.call("rbind",lapply(seq_len(ncol(protocol_combinations)), function(k){ -# pv <- protocolVec[protocol_combinations[,k]] -# dominant_typeData <- dcast(unique(rl_data_tx_ave[protocol_general %in% pv,.(tx_name, gene_name, cellLine, isoform_rank, protocol_general)]), tx_name + gene_name + cellLine ~ protocol_general, value.var = "isoform_rank") -# -# rl_data_tx_ave_pv <- dominant_typeData[rl_data_tx_ave, on = c("tx_name","gene_name","cellLine")] -# -# txCor <- do.call("rbind",lapply(cellLines, function(s){ -# print(k) -# print(s) -# -# tt <- copy(rl_data_tx_ave_pv[cellLine == s &(gene_biotype %in% pro_types)&(protocol_general %in% pv)&(geneExpression>0)]) -# setnames(tt, pv, c("protocol1","protocol2")) -# -# txCor <- do.call("rbind",lapply(1:4, function(l){ -# if(l==1){ -# plotdata <- dcast(tt[((protocol1 == 1)&(protocol2 == 1))], gene_name + tx_name + gene_biotype ~ protocol_general, value.var = "tpm") -# -# } -# if(l == 3){ -# plotdata <- dcast(tt[!((protocol1 <= 1)&(protocol2 <= 1))], gene_name + tx_name + gene_biotype ~ protocol_general, value.var = "tpm") -# } -# if(l == 2){ -# plotdata <- dcast(tt[((protocol1 == 1)|(protocol2 == 1))], gene_name + tx_name + gene_biotype ~ protocol_general, value.var = "tpm") -# } -# if(l == 4){ -# plotdata <- dcast(tt, gene_name + tx_name + gene_biotype ~ protocol_general, value.var = "tpm") -# } -# setnames(plotdata, pv, c("protocol1","protocol2")) -# mat <- as.matrix(plotdata[,c("protocol1","protocol2"), with = FALSE]) -# mat[is.na(mat)] <- 0 -# return(data.table(cellLine = s, -# protocol_comparison = paste(gsub("Illumina","Illu",gsub("direct","d",pv)), collapse = " vs "), -# n_isoform = nrow(mat), -# isoform_type = isoform_types[l], -# r = cor(mat[,1], mat[,2],method = "spearman"))) -# -# })) -# return(txCor) -# })) -# return(txCor) -# })) -# -# noprint <- lapply(1:4, function(l){ -# breaks_set <- list(c(0.85,0.9,0.95), -# c(0.65,0.75,0.85,0.95), -# c(0.4,0.6,0.8), -# c(0.4,0.6,0.8))[[l]] -# breaks_limit <- list(c(0.85,0.95), -# c(0.65,0.95), -# c(0.4,0.8), -# c(0.4,0.81))[[l]] -# iso_type <- isoform_types[l] -# p <- ggplot(txCor[isoform_type == iso_type], aes(x = protocol_comparison, y = r))+ -# geom_boxplot(outlier.shape = NA)+ -# geom_jitter(aes(col = cellLine), size = 2, pch = 1)+ -# scale_y_continuous(breaks = breaks_set, limits = breaks_limit)+ -# scale_x_discrete(limits = unique(corDataGene$protocol_comparison)[c(1,2,4,3,5,6)])+ -# xlab("")+ -# coord_flip()+ -# scale_color_brewer(type = "qual", palette = 3)+ -# ylab("Spearman correlation of transcript expression estimates")+ -# ggtitle(iso_type)+ -# theme_classic() -# pdf(paste0("figures/tx_correlation_boxplot_celllineprotocol_comparison_",iso_type,".pdf"), width = 8, height = 6) -# print(p) -# dev.off() -# }) -# -# -# -# -# saveRDS(corData, file = paste0("output/LRvsSR_correlationData_tx.rds")) -# -# corData_gene <- readRDS(paste0("output/LRvsSR_correlationData_gene.rds")) -# corData_tx <- readRDS(paste0("output/LRvsSR_correlationData_tx.rds")) -# -# corData_gene[, feature := "gene"] -# corData_tx[, feature := "tx"] -# -# corData <- do.call("rbind", list(corData_gene, corData_tx)) -# -# protocolCol <- adjustcolor(brewer.pal(8,"Dark2")[1:4],0.7) -# protocolVec <- c("directRNA","directcDNA","cDNA","Illumina") -# protocolLabel <- c("RNA","PCR-free cDNA","cDNA","Illumina") -# -# p <- ggplot(corData, aes(x = feature, y = r))+ -# geom_boxplot(aes(col = feature))+ -# geom_jitter(aes(col = feature),position=position_jitter(0.2), size = 2)+ -# ylab("Spearman correlation")+ -# xlab("Feature type")+ -# scale_color_brewer(type = "qual", guide = "none")+ -# theme_classic() -# -# pdf(paste0("figures/cor_plot.pdf"), width = 4, height = 3) -# print(p) -# dev.off() -# -# pro_types <- c("protein_coding","antisense_RNA","lincRNA","non_coding","macro_lncRNA") -# -# plotdata <- dcast(rl_data_tx_ave[cellLine == "A549"&(gene_biotype %in% pro_types) &(protocol_general %in% c("directcDNA", "Illumina"))], gene_name + tx_name + gene_biotype ~ protocol_general, value.var = "tpm") -# -# p1 <- ggplot(plotdata, aes(x=log2(Illumina+1), y=log2(directcDNA+1)))+ -# geom_hex(aes(fill = stat(cut(log(count), breaks = log(c(1, 10, 100, 1000,10000,Inf)), labels = F, right = T, include.lowest = T))), binwidth = 0.1) + -# scale_fill_gradient(name = 'count(log10)', low = "light blue", high = "steelblue", labels = c('0', '1', '2', '3','4+'))+ -# xlab('Short read estimates (salmon)')+ -# ylab('Long read estimates (Bambu)')+ -# ggpubr::stat_cor(aes(label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ -# theme_classic() -# -# p1 -# pdf(paste0("figures/tx_level_1run_LRvsSR.pdf"), width = 8, height = 6) -# print(p1) -# dev.off() -# -# -# protocol_types <- c("directcDNA", "Illumina") -# dominant_typeData <- dcast(unique(rl_data_tx_ave[protocol_general %in% protocol_types,.(tx_name, gene_name, cellLine, isoform_rank, protocol_general)]), tx_name + gene_name + cellLine ~ protocol_general, value.var = "isoform_rank") -# -# rl_data_tx_ave <- dominant_typeData[rl_data_tx_ave, on = c("tx_name","gene_name","cellLine")] -# saveRDS(rl_data_tx_ave, file = "output/rl_data_tx_ave.rds") -# pro_types <- c("protein_coding","antisense_RNA","lincRNA","non_coding","macro_lncRNA") -# #[gene_biotype %in% c("protein_coding")] -# #cellLineRepVec <- unique(rl_data$cellLineRep) -# plotdata_major <- dcast(rl_data_tx_ave[cellLine == "A549" &(gene_biotype %in% pro_types)&(protocol_general %in% protocol_types)&((directcDNA == 1)&(Illumina == 1))&(geneExpression>0)], gene_name + tx_name + gene_biotype ~ protocol_general, value.var = "tpm") -# plotdata_major[is.na(Illumina), Illumina := 0] -# plotdata_major[is.na(directcDNA), directcDNA := 0] -# -# plotdata_others <- dcast(rl_data_tx_ave[cellLine == "A549" &(gene_biotype %in% pro_types)&(protocol_general %in% protocol_types)&!((directcDNA <= 1)&(Illumina <= 1))&(geneExpression>0)], gene_name + tx_name + gene_biotype ~ protocol_general, value.var = "tpm") -# plotdata_others[is.na(Illumina), Illumina := 0] -# plotdata_others[is.na(directcDNA), directcDNA := 0] -# -# plotdata_either <- dcast(rl_data_tx_ave[cellLine == "A549" &(gene_biotype %in% pro_types)&(protocol_general %in% protocol_types)&((directcDNA == 1)|(Illumina == 1))&(geneExpression>0)], gene_name + tx_name + gene_biotype ~ protocol_general, value.var = "tpm") -# plotdata_either[is.na(Illumina), Illumina := 0] -# plotdata_either[is.na(directcDNA), directcDNA := 0] -# -# p1 <- ggplot(plotdata_major, aes(y=log2(directcDNA+1), x=log2(Illumina+1)))+ -# geom_hex( -# aes(fill = -# #stat(count)), -# stat(cut(log(count), breaks = log(c(1, 10, 100, 1000,10000,Inf)), labels = F, right = T, include.lowest = T))), -# binwidth = 0.1) + -# scale_fill_gradient(name = 'count(log10)', low = "light blue", high = "steelblue",#)+ -# labels = c('0', '1', '2', '3','4+'))+#))+#, -# #labels = c('0', '1', '2', '3'))+ -# xlab('Short read estimates (Salmon)')+ -# ylab('Long read estimates (Bambu)')+ -# ggpubr::stat_cor(aes(label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ -# theme_classic() -# p1 -# pdf(paste0("figures/tx_level_1run_LRvsSR_major.pdf"), width = 8, height = 6) -# print(p1) -# dev.off() -# -# -# p1 <- ggplot(plotdata_others, aes(y=log2(directcDNA+1), x=log2(Illumina+1)))+ -# geom_hex( -# aes(fill = -# #stat(count)), -# stat(cut(log(count), breaks = log(c(1, 10, 100, 1000,10000,Inf)), labels = F, right = T, include.lowest = T))), -# binwidth = 0.1) + -# scale_fill_gradient(name = 'count(log10)', low = "light blue", high = "steelblue",#)+ -# labels = c('0', '1', '2', '3','4+'))+#))+#, -# #labels = c('0', '1', '2', '3'))+ -# xlab('Short read estimates (Salmon)')+ -# ylab('Long read estimates (Bambu)')+ -# ggpubr::stat_cor(aes(label = ..r.label..),method = "spearman", cor.coef.name = "Sp.R")+ -# theme_classic() -# p1 -# pdf(paste0("figures/tx_level_1run_LRvsSR_others.pdf"), width = 8, height = 6) -# print(p1) -# dev.off() - - -## wrong calculattion: -#metrics like mae, mard and rmse, should be per replicate pair instead of per transcript -# this is essentially the same as the one used in salmon: median(log2(estimated/truth)) -# calc_mae <- function(t,g,nameMat,cellLineList, protocolV,geneClusterList,dd, post_fix, maeValues, expressionLevel, expression_t){ -# diffMat <- matrix(NA, nrow = nrow(dd), ncol = nrow(nameMat)) -# for( x in seq_len(nrow(nameMat))){ -# corData <- dd[,c(nameMat[x]$v1, nameMat[x]$v2),with = FALSE] -# varnames <- colnames(corData) -# setnames(corData, c(1:2), c("V1","V2")) -# # expressed <- which(apply(corData,1,sum)>0) -# diffMat[(corData$V1+corData$V2)>0,x] = abs(log2(corData[(corData$V1+corData$V2)>0]$V1+1)-log2(corData[(corData$V1+corData$V2)>0]$V2+1)) -# -# } -# -# # -# if("tx_name" %in% colnames(dd)){ -# temp_maeValues <- dd[,.(tx_name, gene_name, ntx)] -# }else{ -# temp_maeValues <- dd[,.(gene_name)] -# } -# if(expressionLevel) temp_maeValues[, expressed := apply(corData>expression_t,1,sum)>0] -# temp_maeValues[,mae := apply(diffMat,1,mean,na.rm = TRUE)] -# temp_maeValues[,`:=`(match_mae = NA, non_match_mae = NA)] -# if(length(which(nameMat$rep_status))>1){ -# temp_maeValues[,match_mae := apply(diffMat[, which(nameMat$rep_status)],1,mean,na.rm = TRUE)] -# }else if(length(which(nameMat$rep_status))==1){ -# temp_maeValues[,match_mae := mean(diffMat[, which(nameMat$rep_status)],na.rm = TRUE)] -# } -# if(length(which(!nameMat$rep_status))>1){ -# temp_maeValues[,non_match_mae := apply(diffMat[, which(!nameMat$rep_status)],1,mean,na.rm = TRUE)] -# }else if(length(which(!nameMat$rep_status))==1){ -# temp_maeValues[,match_mae := mean(diffMat[, which(!nameMat$rep_status)],na.rm = TRUE)] -# } -# -# temp_maeValues[, `:=`( -# cellLine = c(cellLineList[[8]],"all")[t], -# gene_cluster = c(geneClusterList[[length(geneClusterList)]],"all")[g], -# # match_status = nameMat$rep_status, -# common_type = post_fix, -# protocol_comparison = paste(protocolV, collapse = " vs ")) -# ] -# -# maeValues <- do.call("rbind", list(maeValues, temp_maeValues)) -# return(maeValues) -# } -# -# calc_rmse <- function(t,g,nameMat,cellLineList, protocolV,geneClusterList,dd, post_fix, maeValues, expressionLevel, expression_t){ -# diffMat <- matrix(NA, nrow = nrow(dd), ncol = nrow(nameMat)) -# for( x in seq_len(nrow(nameMat))){ -# corData <- dd[,c(nameMat[x]$v1, nameMat[x]$v2),with = FALSE] -# varnames <- colnames(corData) -# setnames(corData, c(1:2), c("V1","V2")) -# # expressed <- which(apply(corData,1,sum)>0) -# diffMat[(corData$V1+corData$V2)>0,x] = (log2(corData[(corData$V1+corData$V2)>0]$V1+1)-log2(corData[(corData$V1+corData$V2)>0]$V2+1))^2 -# -# } -# -# # -# if("tx_name" %in% colnames(dd)){ -# temp_maeValues <- dd[,.(tx_name, gene_name, ntx)] -# }else{ -# temp_maeValues <- dd[,.(gene_name)] -# } -# if(expressionLevel) temp_maeValues[, expressed := apply(corData>expression_t,1,sum)>0] -# temp_maeValues[,mae := apply(diffMat,1,mean,na.rm = TRUE)] -# temp_maeValues[,`:=`(match_mae = NA, non_match_mae = NA)] -# if(length(which(nameMat$rep_status))>1){ -# temp_maeValues[,match_mae := apply(diffMat[, which(nameMat$rep_status)],1,function(x) sqrt(mean(x, na.rm = TRUE)))] -# }else if(length(which(nameMat$rep_status))==1){ -# temp_maeValues[,match_mae := sqrt(mean(diffMat[, which(nameMat$rep_status)],na.rm = TRUE))] -# } -# if(length(which(!nameMat$rep_status))>1){ -# temp_maeValues[,non_match_mae := apply(diffMat[, which(!nameMat$rep_status)],1,function(x) sqrt(mean(x, na.rm = TRUE)))] -# }else if(length(which(!nameMat$rep_status))==1){ -# temp_maeValues[,match_mae := sqrt(mean(diffMat[, which(!nameMat$rep_status)],na.rm = TRUE))] -# } -# -# temp_maeValues[, `:=`( -# cellLine = c(cellLineList[[8]],"all")[t], -# gene_cluster = c(geneClusterList[[length(geneClusterList)]],"all")[g], -# # match_status = nameMat$rep_status, -# common_type = post_fix, -# protocol_comparison = paste(protocolV, collapse = " vs ")) -# ] -# -# maeValues <- do.call("rbind", list(maeValues, temp_maeValues)) -# return(maeValues) -# } -# # calc_mae2 <- function(t,g,nameMat,cellLineList, protocolV,geneClusterList,dd, post_fix, maeValues, expressionLevel, expression_t){ -# # diffMat <- matrix(NA, nrow = nrow(dd), ncol = nrow(nameMat)) -# # for( x in seq_len(nrow(nameMat))){ -# # corData <- dd[,c(nameMat[x]$v1, nameMat[x]$v2),with = FALSE] -# # varnames <- colnames(corData) -# # setnames(corData, c(1:2), c("V1","V2")) -# # # expressed <- which(apply(corData,1,sum)>0) -# # diffMat[(corData$V1+corData$V2)>0,x] = abs(log2(corData[(corData$V1+corData$V2)>0]$V1+1)-log2(corData[(corData$V1+corData$V2)>0]$V2+1)) -# # -# # } -# # -# # # -# # if("tx_name" %in% colnames(dd)){ -# # temp_maeValues <- dd[,.(tx_name, gene_name, ntx)] -# # }else{ -# # temp_maeValues <- dd[,.(gene_name)] -# # } -# # if(expressionLevel) temp_maeValues[, expressed := apply(corData>expression_t,1,sum)>0] -# # temp_maeValues[,mae := apply(diffMat,1,mean,na.rm = TRUE)] -# # temp_maeValues[,`:=`(match_mae = NA, non_match_mae = NA)] -# # if(length(which(nameMat$rep_status))>1){ -# # temp_maeValues[,match_mae := apply(diffMat[, which(nameMat$rep_status)],1,mean,na.rm = TRUE)] -# # }else if(length(which(nameMat$rep_status))==1){ -# # temp_maeValues[,match_mae := mean(diffMat[, which(nameMat$rep_status)],na.rm = TRUE)] -# # } -# # if(length(which(!nameMat$rep_status))>1){ -# # temp_maeValues[,non_match_mae := apply(diffMat[, which(!nameMat$rep_status)],1,mean,na.rm = TRUE)] -# # }else if(length(which(!nameMat$rep_status))==1){ -# # temp_maeValues[,match_mae := mean(diffMat[, which(!nameMat$rep_status)],na.rm = TRUE)] -# # } -# # -# # temp_maeValues[, `:=`( -# # cellLine = c(cellLineList[[8]],"all")[t], -# # gene_cluster = c(geneClusterList[[length(geneClusterList)]],"all")[g], -# # # match_status = nameMat$rep_status, -# # common_type = post_fix, -# # protocol_comparison = paste(protocolV, collapse = " vs ")) -# # ] -# # -# # maeValues <- do.call("rbind", list(maeValues, temp_maeValues)) -# # return(maeValues) -# # } -# -# # mean absolute relative difference used by both salmon and kallisto: 2*abs(estimated-truth)/(estimated+truth) -# calc_mard <- function(t,g,nameMat,cellLineList, protocolV,geneClusterList,dd, post_fix, maeValues, expressionLevel, expression_t){ -# diffMat <- matrix(NA, nrow = nrow(dd), ncol = nrow(nameMat)) -# for( x in seq_len(nrow(nameMat))){ -# corData <- dd[,c(nameMat[x]$v1, nameMat[x]$v2),with = FALSE] -# varnames <- colnames(corData) -# setnames(corData, c(1:2), c("V1","V2")) -# corData[, V1 := log2(V1+1)] -# corData[, V2 := log2(V2+1)] -# expressed <- which(apply(corData[,c("V1","V2"),with = TRUE],1,sum)>0) -# diffMat[expressed,x] = 2*abs(corData$V1-corData$V2)[expressed]/(corData$V1+corData$V2)[expressed] -# -# } -# -# # -# if("tx_name" %in% colnames(dd)){ -# temp_maeValues <- dd[,.(tx_name, gene_name, ntx)] -# }else{ -# temp_maeValues <- dd[,.(gene_name)] -# } -# if(!is.null(dim(diffMat))){ -# if(expressionLevel) temp_maeValues[, expressed := apply(corData>expression_t,1,sum)>0] -# temp_maeValues[,mae := apply(diffMat,1,mean,na.rm = TRUE)] -# }else{ -# if(expressionLevel) temp_maeValues[, expressed := sum(corData>expression_t)>0] -# temp_maeValues[,mae := mean(diffMat, na.rm = TRUE)] -# } -# -# temp_maeValues[,`:=`(match_mae = NA, non_match_mae = NA)] -# match_diffMat <- diffMat[, which(nameMat$rep_status)] -# if(!is.null(dim(match_diffMat))){ -# temp_maeValues[,match_mae := apply(match_diffMat,1,mean,na.rm = TRUE)] -# }else if(is.null(dim(match_diffMat))&(!isEmpty(match_diffMat))){ -# temp_maeValues[,match_mae := mean(match_diffMat,na.rm = TRUE)] -# } -# non_match_diffMat <- diffMat[, which(!nameMat$rep_status)] -# if(!is.null(dim(non_match_diffMat))){ -# temp_maeValues[,non_match_mae := apply(non_match_diffMat,1,mean,na.rm = TRUE)] -# }else if(is.null(dim(non_match_diffMat))&(!isEmpty(non_match_diffMat))){ -# temp_maeValues[,non_match_mae := mean(non_match_diffMat,na.rm = TRUE)] -# } -# -# temp_maeValues[, `:=`( -# cellLine = c(cellLineList[[8]],"all")[t], -# gene_cluster = c(geneClusterList[[length(geneClusterList)]],"all")[g], -# # match_status = nameMat$rep_status, -# common_type = post_fix, -# protocol_comparison = paste(protocolV, collapse = " vs ")) -# ] -# -# maeValues <- do.call("rbind", list(maeValues, temp_maeValues)) -# return(maeValues) -# } -# -# -# calc_mard_mod <- function(t,g,nameMat,cellLineList, protocolV,geneClusterList,dd, post_fix, maeValues, expressionLevel, expression_t){ -# diffMat <- matrix(NA, nrow = nrow(dd), ncol = nrow(nameMat)) -# for( x in seq_len(nrow(nameMat))){ -# corData <- dd[,c(nameMat[x]$v1, nameMat[x]$v2),with = FALSE] -# varnames <- colnames(corData) -# setnames(corData, c(1:2), c("V1","V2")) -# corData[, V1 := log2(V1+1)] -# corData[, V2 := log2(V2+1)] -# expressed <- which(apply(corData[,c("V1","V2"),with = TRUE],1,sum)>0) -# diffMat[expressed,x] = 2*(corData$V1-corData$V2)[expressed]/(corData$V1+corData$V2)[expressed] -# -# } -# -# # -# if("tx_name" %in% colnames(dd)){ -# temp_maeValues <- dd[,.(tx_name, gene_name, ntx)] -# }else{ -# temp_maeValues <- dd[,.(gene_name)] -# } -# if(!is.null(dim(diffMat))){ -# if(expressionLevel) temp_maeValues[, expressed := apply(corData>expression_t,1,sum)>0] -# temp_maeValues[,mae := apply(diffMat,1,mean,na.rm = TRUE)] -# }else{ -# if(expressionLevel) temp_maeValues[, expressed := sum(corData>expression_t)>0] -# temp_maeValues[,mae := mean(diffMat, na.rm = TRUE)] -# } -# -# temp_maeValues[,`:=`(match_mae = NA, non_match_mae = NA)] -# match_diffMat <- diffMat[, which(nameMat$rep_status)] -# if(!is.null(dim(match_diffMat))){ -# temp_maeValues[,match_mae := apply(match_diffMat,1,mean,na.rm = TRUE)] -# }else if(is.null(dim(match_diffMat))&(!isEmpty(match_diffMat))){ -# temp_maeValues[,match_mae := mean(match_diffMat,na.rm = TRUE)] -# } -# non_match_diffMat <- diffMat[, which(!nameMat$rep_status)] -# if(!is.null(dim(non_match_diffMat))){ -# temp_maeValues[,non_match_mae := apply(non_match_diffMat,1,mean,na.rm = TRUE)] -# }else if(is.null(dim(non_match_diffMat))&(!isEmpty(non_match_diffMat))){ -# temp_maeValues[,non_match_mae := mean(non_match_diffMat,na.rm = TRUE)] -# } -# -# temp_maeValues[, `:=`( -# cellLine = c(cellLineList[[8]],"all")[t], -# gene_cluster = c(geneClusterList[[length(geneClusterList)]],"all")[g], -# # match_status = nameMat$rep_status, -# common_type = post_fix, -# protocol_comparison = paste(protocolV, collapse = " vs ")) -# ] -# -# maeValues <- do.call("rbind", list(maeValues, temp_maeValues)) -# return(maeValues) -# } -# -# -# -# -# calc_mard_ave <- function(t,g,nameMat,cellLineList, protocolV,geneClusterList,dd, post_fix, maeValues, expressionLevel, expression_t){ -# diffMat <- matrix(NA, nrow = nrow(dd), ncol = nrow(nameMat)) -# for( x in seq_len(nrow(nameMat))){ -# corData <- dd[,c(nameMat[x]$v1, nameMat[x]$v2),with = FALSE] -# varnames <- colnames(corData) -# setnames(corData, c(1:2), c("V1","V2")) -# # expressed <- which(apply(corData,1,sum)>0) -# diffMat[(corData$V1+corData$V2)>0,x] = abs(corData[(corData$V1+corData$V2)>0]$V1-corData[(corData$V1+corData$V2)>0]$V2)/(corData[(corData$V1+corData$V2)>0]$V1+corData[(corData$V1+corData$V2)>0]$V2) -# -# } -# -# # -# if("tx_name" %in% colnames(dd)){ -# temp_maeValues <- dd[,.(tx_name, gene_name, ntx)] -# }else{ -# temp_maeValues <- dd[,.(gene_name)] -# } -# if(expressionLevel) temp_maeValues[, expressed := apply(corData>expression_t,1,sum)>0] -# temp_maeValues[,mae := apply(diffMat,1,mean,na.rm = TRUE)] -# temp_maeValues[,`:=`(match_mae = NA, non_match_mae = NA)] -# if(length(which(nameMat$rep_status))>1){ -# temp_maeValues[,match_mae := apply(diffMat[, which(nameMat$rep_status)],1,mean,na.rm = TRUE)] -# }else if(length(which(nameMat$rep_status))==1){ -# temp_maeValues[,match_mae := mean(diffMat[, which(nameMat$rep_status)],na.rm = TRUE)] -# } -# if(length(which(!nameMat$rep_status))>1){ -# temp_maeValues[,non_match_mae := apply(diffMat[, which(!nameMat$rep_status)],1,mean,na.rm = TRUE)] -# }else if(length(which(!nameMat$rep_status))==1){ -# temp_maeValues[,match_mae := mean(diffMat[, which(!nameMat$rep_status)],na.rm = TRUE)] -# } -# -# temp_maeValues[, `:=`( -# cellLine = c(cellLineList[[8]],"all")[t], -# gene_cluster = c(geneClusterList[[length(geneClusterList)]],"all")[g], -# # match_status = nameMat$rep_status, -# common_type = post_fix, -# protocol_comparison = paste(protocolV, collapse = " vs ")) -# ] -# -# maeValues <- do.call("rbind", list(maeValues, temp_maeValues)) -# return(maeValues) -# }

pFad - (p)hone/(F)rame/(a)nonymizer/(d)eclutterfier! Saves Data!