目標:找出我的資料列中在不同比較之間共同出現的元素,這些元素基本上是基因名稱。
這是我試圖遵循的答案。
# Three toy gene lists; "gene3" is the only gene present in all of them.
df1 <- data.frame(genes = c("gene1", "gene3", "gene4", "gene2"))
df2 <- data.frame(genes = c("gene3", "gene2", "gene5", "gene1", "genet"))
df3 <- data.frame(
  genes = c("gene6", "gene3", "gene4", "gdene7", "genex", "gene10")
)

dfList <- list(df1, df2, df3)

# Fold the list with successive inner joins, which keeps only the genes
# shared by every data frame (the join column is guessed, hence the
# "Joining, by" messages).
reduce(dfList, inner_join)
reduce(dfList, inner_join)
Joining, by = "genes"
Joining, by = "genes"
genes
1 gene3
在這種情況下失敗
# Same toy lists, except df3 now has "gene13" instead of "gene3", so no
# gene is present in all three data frames.
df1 <- data.frame(genes = c("gene1", "gene3", "gene4", "gene2"))
df2 <- data.frame(genes = c("gene3", "gene2", "gene5", "gene1", "genet"))
df3 <- data.frame(
  genes = c("gene6", "gene13", "gene4", "gdene7", "genex", "gene10")
)

dfList <- list(df1, df2, df3)

# Successive inner joins keep only genes shared by every data frame, so
# the result here is empty.
reduce(dfList, inner_join)
reduce(dfList, inner_join)
Joining, by = "genes"
Joining, by = "genes"
[1] genes
<0 rows> (or 0-length row.names)
現在該如何解決這個問題?我這裡給的只是一個小例子,實際上我有大約 15 個比較。
Expected output
gene3 df1 df2 df3 ## for common genes
gene1 df1 df2 ## for genes which are not present across all the combinations
gene2
在第一種情況下,這個解法之所以可行,是因為 gene3 存在於所有資料框中;但當基因只存在於其中 2 個條件時,它就會失敗。
那么我如何找出所有可能的組合,其中基因以不同的可能組合存在。
例如,gene3 存在於全部三個資料框中,因此會被報告;但 gene1 和 gene2 只存在於 df1 和 df2 中,卻沒有被報告。
所以我想看看一組基因是否存在于所有條件下,這是最不可能的,但它存在的所有可能的組合
我的實際資料框是這樣命名的,它在一個串列中
names(result_abd)
[1] "M0_vs_M1_TCGA_stages" "M0_vs_M2_TCGA_stages" "M0_vs_M3_TCGA_stages" "M0_vs_M4_TCGA_stages" "M0_vs_M5_TCGA_stages" "M1_vs_M2_TCGA_stages"
[7] "M1_vs_M3_TCGA_stages" "M1_vs_M4_TCGA_stages" "M1_vs_M5_TCGA_stages" "M2_vs_M3_TCGA_stages" "M2_vs_M4_TCGA_stages" "M2_vs_M5_TCGA_stages"
[13] "M3_vs_M4_TCGA_stages" "M3_vs_M5_TCGA_stages" "M4_vs_M5_TCGA_stages"
>
所以我希望每個資料框有 15 列
我運行了你的代碼,輸出就是這樣
dput(head(a))
structure(list(gene = c("ENSG00000000003", "ENSG00000000971",
"ENSG00000002726", "ENSG00000003989", "ENSG00000005381", "ENSG00000006534"
), dfM0_vs_M1_TCGA_stages = c("M0_vs_M1_TCGA_stages", "M0_vs_M1_TCGA_stages",
"M0_vs_M1_TCGA_stages", "M0_vs_M1_TCGA_stages", "M0_vs_M1_TCGA_stages",
"M0_vs_M1_TCGA_stages"), dfM0_vs_M2_TCGA_stages = c(NA, "M0_vs_M2_TCGA_stages",
"M0_vs_M2_TCGA_stages", NA, "M0_vs_M2_TCGA_stages", NA), dfM0_vs_M3_TCGA_stages = c("M0_vs_M3_TCGA_stages",
"M0_vs_M3_TCGA_stages", "M0_vs_M3_TCGA_stages", NA, "M0_vs_M3_TCGA_stages",
NA), dfM0_vs_M4_TCGA_stages = c("M0_vs_M4_TCGA_stages", NA, "M0_vs_M4_TCGA_stages",
NA, "M0_vs_M4_TCGA_stages", "M0_vs_M4_TCGA_stages"), dfM0_vs_M5_TCGA_stages = c("M0_vs_M5_TCGA_stages",
NA, "M0_vs_M5_TCGA_stages", NA, "M0_vs_M5_TCGA_stages", "M0_vs_M5_TCGA_stages"
), dfM1_vs_M2_TCGA_stages = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), dfM1_vs_M3_TCGA_stages = c(NA,
NA, NA, NA, "M1_vs_M3_TCGA_stages", NA), dfM1_vs_M4_TCGA_stages = c(NA,
"M1_vs_M4_TCGA_stages", NA, NA, NA, NA), dfM1_vs_M5_TCGA_stages = c(NA,
NA, "M1_vs_M5_TCGA_stages", NA, NA, NA), dfM2_vs_M3_TCGA_stages = c(NA,
NA, NA, NA, "M2_vs_M3_TCGA_stages", NA), dfM2_vs_M4_TCGA_stages = c(NA,
"M2_vs_M4_TCGA_stages", NA, NA, NA, NA), dfM2_vs_M5_TCGA_stages = c(NA,
NA, "M2_vs_M5_TCGA_stages", NA, "M2_vs_M5_TCGA_stages", NA),
dfM3_vs_M4_TCGA_stages = c(NA, "M3_vs_M4_TCGA_stages", NA,
NA, "M3_vs_M4_TCGA_stages", NA), dfM3_vs_M5_TCGA_stages = c(NA,
"M3_vs_M5_TCGA_stages", NA, NA, "M3_vs_M5_TCGA_stages", NA
), dfM4_vs_M5_TCGA_stages = c(NA, NA, "M4_vs_M5_TCGA_stages",
NA, NA, NA)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
資料框格式
A tibble: 6 × 16
gene dfM0_vs_M1_TCGA… dfM0_vs_M2_TCGA… dfM0_vs_M3_TCGA… dfM0_vs_M4_TCGA… dfM0_vs_M5_TCGA… dfM1_vs_M2_TCGA… dfM1_vs_M3_TCGA… dfM1_vs_M4_TCGA…
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 ENSG00000000003 M0_vs_M1_TCGA_s… NA M0_vs_M3_TCGA_s… M0_vs_M4_TCGA_s… M0_vs_M5_TCGA_s… NA NA NA
2 ENSG00000000971 M0_vs_M1_TCGA_s… M0_vs_M2_TCGA_s… M0_vs_M3_TCGA_s… NA NA NA NA M1_vs_M4_TCGA_s…
3 ENSG00000002726 M0_vs_M1_TCGA_s… M0_vs_M2_TCGA_s… M0_vs_M3_TCGA_s… M0_vs_M4_TCGA_s… M0_vs_M5_TCGA_s… NA NA NA
4 ENSG00000003989 M0_vs_M1_TCGA_s… NA NA NA NA NA NA NA
5 ENSG00000005381 M0_vs_M1_TCGA_s… M0_vs_M2_TCGA_s… M0_vs_M3_TCGA_s… M0_vs_M4_TCGA_s… M0_vs_M5_TCGA_s… NA M1_vs_M3_TCGA_s… NA
6 ENSG00000006534 M0_vs_M1_TCGA_s… NA NA M0_vs_M4_TCGA_s… M0_vs_M5_TCGA_s… NA NA NA
現在這就是我想要的。下一步作為我想看的例子
例如,我看到基因 ENSG00000000971 存在於 7 個比較中,而在其他比較中不存在(顯示為 NA)。我該如何對它們進行分組?
例如用這些基因建立另一個資料框,只列出每個基因出現在哪些比較中,而不包含值為 NA 的比較。
uj5u.com熱心網友回復:
我不清楚您希望輸出採用什麼格式(多個索引列、一個包含字串的列,或是一個串列列)。但這是一種選擇。我首先將您的資料串列組合到一個資料框中,并帶有一個指示來源的索引。
library(tidyverse)

dfList <- list(df1, df2, df3)

# Build a gene-by-source presence table.
dfList %>%
  # Stack every data frame; `.id` stores each row's source (the list
  # position, or the element name for a named list) in a character
  # column called `df`.
  bind_rows(.id = "df") %>%
  # Drop exact duplicate rows: a gene listed twice in the same data frame
  # would otherwise make pivot_wider() warn and return list-columns.
  distinct() %>%
  # One row per gene and one "df<source>" column per data frame; a cell
  # holds the source id when the gene is present there and NA otherwise.
  pivot_wider(names_from = df, names_prefix = "df", values_from = df)
# A tibble: 10 × 4
genes df1 df2 df3
<chr> <chr> <chr> <chr>
1 gene1 1 2 NA
2 gene3 1 2 3
3 gene4 1 NA 3
4 gene2 1 2 NA
5 gene5 NA 2 NA
6 genet NA 2 NA
7 gene6 NA NA 3
8 gdene7 NA NA 3
9 genex NA NA 3
10 gene10 NA NA 3
補充回答 OP 在下方提出的後續問題。(不過請注意,這實際上是一個新問題,應該另開一個新帖子。)
# List the genes that occur in a single source only, where that source
# has id 3.
dfList %>%
  bind_rows(.id = "df") %>%
  group_by(genes) %>%
  # `df` is the character source id added by bind_rows(); identical min
  # and max means the gene was seen in one source only.
  summarise(minDF = min(df), maxDF = max(df)) %>%
  # Comma-separated conditions are combined with AND; the numeric 3 is
  # matched against the character id through R's implicit coercion.
  filter(minDF == maxDF, maxDF == 3) %>%
  pull(genes)
[1] "gdene7" "gene10" "gene6" "genex"
再一次,關鍵是將所有資料放入一個資料框中。(并且輸出的所需格式不清楚。)
轉載請註明出處,本文鏈接:https://www.uj5u.com/qukuanlian/488577.html
標籤:r
