setwd("C:/Users/S Thakur/OneDrive/Documents/microarray_tutorial/test")
install.packages("splitstackshape")
install.packages("dplyr")
install.packages("ggplot2")
if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install(c("GEOquery", "limma", "clusterProfiler", "affy", "oligo", "preprocessCore", "AnnotationDbi", "org.Hs.eg.db", "arrayQualityMetrics"))
#----------------------------------------------------------
source("https://bioconductor.org/bioclite.r")
if (!require("BiocManager", quietly = TRUE))
  install.packages("BiocManager")
BiocManager::install(c("oligo"),force = TRUE)
BiocManager::install(c("arrayQualityMetrics"))
 ------------------------------------------------------------- 
# Load required libraries
library(GEOquery)
library(limma)
library(clusterProfiler)
library(affy)
library(oligo)
library(clusterProfiler)
library(preprocessCore)
library(ggplot2)
library(splitstackshape)
library(dplyr)
library(AnnotationDbi)
library(org.Hs.eg.db)  
#Option1: Set GEO accession number
geo_accession <- "GSE24460"
# Download and load the data
gse <- getGEO(geo_accession)
data <- exprs(gse[[1]])
normalized_data <- normalize.quantiles(data)

#-------------------------------------------------------------------------------------
#Option2:Download cel files directly and make a similar expression matrix
library("arrayQualityMetrics")
celFiles <- list.celfiles()
affyraw <- read.celfiles(celFiles)
eset <- oligo::rma(affyraw)
write.exprs(eset, file = "exp.txt")

#Differential gene expression anslysis--------------------------------------------------
# Define groups 
group <- factor(c("parental", "parental","drugresistant","drugresistant"))

# Create a design matrix
design <- model.matrix(~0 + group)

# Rename the columns to match your group names
colnames(design) <- levels(group)

#Differential gene expression analysis 
fit <- lmFit(eset,design)
fit <- eBayes(fit)

# Define contrasts (comparing parental_cell to doxorubicin_selected_subline)
contrast_matrix <- makeContrasts(parental - drugresistant, levels = design)

results <- contrasts.fit(fit, contrast_matrix)
results <- eBayes(results)

# Set your desired p-value threshold 
p_value_threshold <- 0.05

# Set your desired log fold change cutoff
logFC_cutoff <- 2

# Extract significant genes with a p-value below the threshold and log fold change >= 2


sig_genes <- topTable(results, coef = 1, adjust.method = "fdr", number = Inf)

# Filter based on p-value and log fold change
sig_genes1 <- sig_genes[sig_genes$P.Value < p_value_threshold & abs(sig_genes$logFC) >= logFC_cutoff, ]

# Save significant genes to a CSV file
write.csv(sig_genes1, file = "significant_genes.csv")

#Annotation using family soft file and Heat map generation-----------------------------------------------------------
mydata <- read.delim("GSE24460_family.soft", check.names = FALSE)
arr <- data.frame(mydata)
aw <- cSplit(arr, "Gene.Symbol","//")
write.table(aw, file="ano.txt", sep = "\t", row.names = FALSE, quote = FALSE)
data1 <- read.delim("ano.txt", check.names = FALSE)
data2 <- read.csv("significant_genes.csv", header = TRUE, sep = ",")
combined <- inner_join(data1, data2, by = "ID")
write.csv(combined, "annotated.csv")
# Save the selected columns to a new CSV file
annotated_data <- read.csv("annotated.csv", header = TRUE)
selected_columns <- annotated_data[, c("ID", "GB_ACC", "ENTREZ_GENE_ID", "Gene.Symbol_01", "Gene.Title", "logFC", "AveExpr", "t", "P.Value", "adj.P.Val", "B")]
df <- subset(selected_columns, Gene.Symbol_01 != "NA")
write.csv(df, file = "DGE_anno.csv", row.names = FALSE)

#---------------------------------GO annotation----------------------------------------------------------
# Read the "annotation.csv" file 
annotation_data <- read.csv("annotated.csv")

# Load the annotation database 
org_db <- org.Hs.eg.db  # For Homo sapiens

# Extract unique gene symbols from your annotation data
unique_gene_symbols <- unique(annotation_data$Gene.Symbol_01)

# Perform GO annotation lookup using gene symbols as the key
go_annotations <- select(org_db, 
                         keys = unique_gene_symbols, 
                         keytype = "SYMBOL", 
                         columns = c("SYMBOL", "GO","GENENAME","ENTREZID","PATH"),
                         multiVals = "list")

# Save the go_annotations to a new CSV file
write.csv(go_annotations, file = "annotated_data_with_GO.csv", row.names = FALSE)


#------------------------------------ GO enrichment annotation-------------------------------------


library(clusterProfiler)


# Extract the gene symbols from the annotation_data
gene_symbols <- annotation_data$ENTREZ_GENE_ID

# Perform KEGG pathway annotation with a p-value cutoff


ans.go <- enrichGO(gene = gene_symbols, ont = "BP",
                   OrgDb ="org.Hs.eg.db",
                   readable=TRUE,
                   pvalueCutoff = 0.05)

tab.go <- as.data.frame(ans.go)
#Save the file
write.csv(tab.go, file = "GO_BP.csv", row.names = FALSE)

ans2.go <- enrichGO(gene = gene_symbols, ont = "MF",
                    OrgDb ="org.Hs.eg.db",
                    readable=TRUE,
                    pvalueCutoff = 0.05)

tab2.go <- as.data.frame(ans2.go)

#Save the file
write.csv(tab2.go, file = "GO_MF.csv", row.names = FALSE)


ans3.go <- enrichGO(gene = gene_symbols, ont = "CC",
                    OrgDb ="org.Hs.eg.db",
                    readable=TRUE,
                    pvalueCutoff = 0.05)

tab3.go <- as.data.frame(ans3.go)

#Save the file
write.csv(tab3.go, file = "GO_CC.csv", row.names = FALSE)

#---------------KEGG enrichment analysis-----------------------
library(org.Hs.eg.db)
library(clusterProfiler)
options(download.file.method = "curl")
suppressWarnings({
  kegg_enrichment <- enrichKEGG(gene = gene_symbols, organism = "hsa", pvalueCutoff = 0.05)
})
kegg <- as.data.frame(kegg_enrichment)
#Save the file
write.csv(kegg, file = "KEGG_enrich.csv", row.names = FALSE)