setwd("C:/Users/S Thakur/OneDrive/Documents/microarray_tutorial/test") install.packages("splitstackshape") install.packages("dplyr") install.packages("ggplot2") if (!require("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install(c("GEOquery", "limma", "clusterProfiler", "affy", "oligo", "preprocessCore", "AnnotationDbi", "org.Hs.eg.db", "arrayQualityMetrics")) #---------------------------------------------------------- source("https://bioconductor.org/bioclite.r") if (!require("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install(c("oligo"),force = TRUE) BiocManager::install(c("arrayQualityMetrics")) ------------------------------------------------------------- # Load required libraries library(GEOquery) library(limma) library(clusterProfiler) library(affy) library(oligo) library(clusterProfiler) library(preprocessCore) library(ggplot2) library(splitstackshape) library(dplyr) library(AnnotationDbi) library(org.Hs.eg.db) #Option1: Set GEO accession number geo_accession <- "GSE24460" # Download and load the data gse <- getGEO(geo_accession) data <- exprs(gse[[1]]) normalized_data <- normalize.quantiles(data) #------------------------------------------------------------------------------------- #Option2:Download cel files directly and make a similar expression matrix library("arrayQualityMetrics") celFiles <- list.celfiles() affyraw <- read.celfiles(celFiles) eset <- oligo::rma(affyraw) write.exprs(eset, file = "exp.txt") #Differential gene expression anslysis-------------------------------------------------- # Define groups group <- factor(c("parental", "parental","drugresistant","drugresistant")) # Create a design matrix design <- model.matrix(~0 + group) # Rename the columns to match your group names colnames(design) <- levels(group) #Differential gene expression analysis fit <- lmFit(eset,design) fit <- eBayes(fit) # Define contrasts (comparing parental_cell to doxorubicin_selected_subline) contrast_matrix <- makeContrasts(parental - drugresistant, levels = design) results <- contrasts.fit(fit, contrast_matrix) results <- eBayes(results) # Set your desired p-value threshold p_value_threshold <- 0.05 # Set your desired log fold change cutoff logFC_cutoff <- 2 # Extract significant genes with a p-value below the threshold and log fold change >= 2 sig_genes <- topTable(results, coef = 1, adjust.method = "fdr", number = Inf) # Filter based on p-value and log fold change sig_genes1 <- sig_genes[sig_genes$P.Value < p_value_threshold & abs(sig_genes$logFC) >= logFC_cutoff, ] # Save significant genes to a CSV file write.csv(sig_genes1, file = "significant_genes.csv") #Annotation using family soft file and Heat map generation----------------------------------------------------------- mydata <- read.delim("GSE24460_family.soft", check.names = FALSE) arr <- data.frame(mydata) aw <- cSplit(arr, "Gene.Symbol","//") write.table(aw, file="ano.txt", sep = "\t", row.names = FALSE, quote = FALSE) data1 <- read.delim("ano.txt", check.names = FALSE) data2 <- read.csv("significant_genes.csv", header = TRUE, sep = ",") combined <- inner_join(data1, data2, by = "ID") write.csv(combined, "annotated.csv") # Save the selected columns to a new CSV file annotated_data <- read.csv("annotated.csv", header = TRUE) selected_columns <- annotated_data[, c("ID", "GB_ACC", "ENTREZ_GENE_ID", "Gene.Symbol_01", "Gene.Title", "logFC", "AveExpr", "t", "P.Value", "adj.P.Val", "B")] df <- subset(selected_columns, Gene.Symbol_01 != "NA") write.csv(df, file = "DGE_anno.csv", row.names = FALSE) #---------------------------------GO annotation---------------------------------------------------------- # Read the "annotation.csv" file annotation_data <- read.csv("annotated.csv") # Load the annotation database org_db <- org.Hs.eg.db # For Homo sapiens # Extract unique gene symbols from your annotation data unique_gene_symbols <- unique(annotation_data$Gene.Symbol_01) # Perform GO annotation lookup using gene symbols as the key go_annotations <- select(org_db, keys = unique_gene_symbols, keytype = "SYMBOL", columns = c("SYMBOL", "GO","GENENAME","ENTREZID","PATH"), multiVals = "list") # Save the go_annotations to a new CSV file write.csv(go_annotations, file = "annotated_data_with_GO.csv", row.names = FALSE) #------------------------------------ GO enrichment annotation------------------------------------- library(clusterProfiler) # Extract the gene symbols from the annotation_data gene_symbols <- annotation_data$ENTREZ_GENE_ID # Perform KEGG pathway annotation with a p-value cutoff ans.go <- enrichGO(gene = gene_symbols, ont = "BP", OrgDb ="org.Hs.eg.db", readable=TRUE, pvalueCutoff = 0.05) tab.go <- as.data.frame(ans.go) #Save the file write.csv(tab.go, file = "GO_BP.csv", row.names = FALSE) ans2.go <- enrichGO(gene = gene_symbols, ont = "MF", OrgDb ="org.Hs.eg.db", readable=TRUE, pvalueCutoff = 0.05) tab2.go <- as.data.frame(ans2.go) #Save the file write.csv(tab2.go, file = "GO_MF.csv", row.names = FALSE) ans3.go <- enrichGO(gene = gene_symbols, ont = "CC", OrgDb ="org.Hs.eg.db", readable=TRUE, pvalueCutoff = 0.05) tab3.go <- as.data.frame(ans3.go) #Save the file write.csv(tab3.go, file = "GO_CC.csv", row.names = FALSE) #---------------KEGG enrichment analysis----------------------- library(org.Hs.eg.db) library(clusterProfiler) options(download.file.method = "curl") suppressWarnings({ kegg_enrichment <- enrichKEGG(gene = gene_symbols, organism = "hsa", pvalueCutoff = 0.05) }) kegg <- as.data.frame(kegg_enrichment) #Save the file write.csv(kegg, file = "KEGG_enrich.csv", row.names = FALSE)