-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmsigdbr_id_converter.R
More file actions
77 lines (64 loc) · 2.16 KB
/
msigdbr_id_converter.R
File metadata and controls
77 lines (64 loc) · 2.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
library(msigdbr)
library(GO.db)
library(AnnotationDbi)
library(dplyr)
library(stringr)
library(readr)
# Make sure the output directory exists; showWarnings = FALSE silences the
# warning dir.create() would otherwise emit (e.g. when it is already there).
dir.create("data", showWarnings = FALSE)
# Species name handed to the msigdbr() queries in this script.
org <- "Homo sapiens"
# Step 1: Lookup table for GO term → GO:ID
# Query GO.db for every GO ID together with its term label, rename the two
# columns to `go_id` / `term`, and lower-case the label so that it can be
# compared against lower-cased gene-set names.
go_term_lookup <- GO.db %>%
  AnnotationDbi::select(
    keys = keys(GO.db),
    columns = c("GOID", "TERM"),
    keytype = "GOID"
  ) %>%
  dplyr::select(go_id = GOID, term = TERM) %>%
  mutate(term = tolower(term))
# Step 2: Retrieve MSigDB gene sets
# Named list with one gene-set table per source database. GO comes from the
# C5 collection; KEGG, Reactome and BioCarta are sub-collections of C2.
# The C2 collection is fetched once and then filtered three ways, rather than
# repeating the identical msigdbr() query for every pathway database.
# local() keeps the intermediate C2 table out of the global environment.
databases <- local({
  c2_gene_sets <- msigdbr(species = org, category = "C2")
  list(
    GO = msigdbr(species = org, category = "C5"),
    KEGG = c2_gene_sets %>%
      filter(gs_subcat %in% c("CP:KEGG_LEGACY", "CP:KEGG_MEDICUS")),
    Reactome = c2_gene_sets %>%
      filter(gs_subcat == "CP:REACTOME"),
    BioCarta = c2_gene_sets %>%
      filter(gs_subcat == "CP:BIOCARTA")
  )
})
# Step 3: Mapping function for GO
# Map MSigDB GO gene-set names (e.g. "GOBP_DNA_REPAIR") to GO identifiers by
# exact comparison of the cleaned-up name with lower-cased GO term labels.
#
# Args:
#   gs_name_vector: character vector of MSigDB gene-set names.
#   lookup: data frame with a `term` column (lower-cased GO term label) and a
#     `go_id` column; defaults to the script-level `go_term_lookup`.
#
# Returns:
#   An unnamed character vector of the same length as `gs_name_vector` with
#   the matched GO ID, or NA_character_ where no label matches. The type is
#   character even for empty or entirely unmatched input, so the result can
#   always be combined with other character ID columns.
#
# NOTE(review): only underscores are turned back into spaces, so a GO label
# containing other punctuation (hyphens, commas, ...) can match only if the
# gene-set name preserves that punctuation — verify the match rate. If the
# msigdbr table already carries the GO ID (possibly a `gs_exact_source`
# column — confirm), using it would avoid name matching altogether.
map_gs_name_to_go <- function(gs_name_vector, lookup = go_term_lookup) {
  # Drop the ontology prefix, restore spaces, and lower-case.
  cleaned <- sub("^GOBP_|^GOMF_|^GOCC_", "", as.character(gs_name_vector))
  cleaned <- tolower(gsub("_", " ", cleaned, fixed = TRUE))

  # One vectorised match() over the whole input; like the per-element version
  # it picks the first matching row, and indexing with NA yields NA.
  as.character(lookup$go_id[match(cleaned, lookup$term)])
}
# Step 4: ID extractors for KEGG, Reactome
# Pull the source database's own identifier out of the gene-set URLs.
#   df: data frame with a `gs_url` column.
#   db: database label; "KEGG" and "Reactome" are the recognised values.
# Returns a character vector: for KEGG the first "hsa" + five digits found in
# the URL, for Reactome the first "R-HSA-<digits>", and NA for every other
# label (BioCarta included) or when the URL contains no such ID.
extract_external_id <- function(df, db) {
  urls <- df$gs_url
  kegg_ids <- str_extract(urls, "hsa\\d{5}")
  reactome_ids <- str_extract(urls, "R-HSA-\\d+")

  # Any label other than the two above falls through to NA.
  case_when(
    db == "KEGG" ~ kegg_ids,
    db == "Reactome" ~ reactome_ids,
    TRUE ~ NA_character_
  )
}
# Step 5: Build and bind all
# For every entry of `databases`, reduce the table to one row per gene set,
# attach the external ID (GO: matched from the gene-set name; all others:
# parsed from the URL) plus the database label, and drop gene sets for which
# no external ID was found.
converter_list <- lapply(names(databases), function(db_name) {
  gene_sets <- distinct(
    databases[[db_name]],
    msigdbr_id = gs_id, gs_name, gs_description, gs_url
  )

  ids <- if (db_name == "GO") {
    map_gs_name_to_go(gene_sets$gs_name)
  } else {
    extract_external_id(gene_sets, db_name)
  }

  gene_sets %>%
    mutate(external_id = ids, db = db_name) %>%
    filter(!is.na(external_id))
})
# Step 6: Combine and write
# Stack the per-database tables, keep one row per unique combination of the
# five output columns (gs_url is not carried over), and save the result as a
# tab-separated file.
converter_df <- distinct(
  bind_rows(converter_list),
  external_id, msigdbr_id, db, gs_name, gs_description
)
write_tsv(converter_df, "data/msigdbr_id_converter.txt")