假設我有以下資料集:
library(tidyr)
library(dplyr)
name1 <- c("John", "John", "John", "John", "John", "John", "John", "John", "John", "John", "Mary", "Mary", "Mary", "Mary", "Mary", "Mary", "Mary", "Mary", "Mary", "Mary","Anne", "Anne", "Anne", "Anne", "Anne", "Anne", "Anne", "Anne", "Anne", "Anne", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe", "David", "David", "David", "David", "David", "David", "David", "David", "David", "David")
name2 <- c("Mary", "Thomas", "Linda", "David", "Joe", "Carl", "Joel", "Victoria", "Elaine", "Cory", "June", "John", "Linda", "David", "Joe", "Holly", "Michael", "Dwight", "Jim", "Andy", "Mary", "John", "Linda", "David", "Joe", "Helen", "Lauren", "Bill", "Saul", "Ben", "Mary", "John", "Linda", "David", "Robert", "Holly", "Michael", "James", "Renee", "Sally", "Mary", "John", "Linda", "Paul", "Joe", "Peter", "Clark", "Elaine", "Cory", "Victoria")
df <- data.frame(name1, name2)
我希望能夠計算“name1”中的每個值在“name2”中具有相同 3 個值的次數。例如,John (name1) 有 Linda、David 和 Joe (name2),而 Mary (name 1) 也有 Linda、David 和 Joe (name2)。因此,如果我們只看 John 和 Mary,則所有 3 個名字都在一個組中的次數是 2。我想制作一個表格,顯示 name2 中三個名字出現在 name1 中的次數。
我知道如何為對撰寫函式:
count_pairs <-
function(df) {
df %>%
apply(1, sort) %>%
t() %>%
data.frame() %>%
group_by_all() %>%
count(name = "Occurrences_Pair")
}
df_pairs <- df %>% count_pairs()
但是對于 3 人、4 人、5 人等的團體,我該怎么做呢?
uj5u.com熱心網友回復:
count_groups_of <- function(df, n) {
varname <- as.symbol(paste0('combs_of_', n))
df %>%
group_by(name1) %>%
summarise(!!varname := combn(name2, n, function(x) toString(sort(x))), .groups = 'drop') %>%
group_by(!!varname) %>%
summarise(n = n(), which_name1 = toString(name1), .groups = 'drop') %>%
arrange(-n)
}
library(dplyr, warn.conflicts = FALSE)
df %>%
count_groups_of(3)
#> # A tibble: 556 × 3
#> combs_of_3 n which_name1
#> <chr> <int> <chr>
#> 1 David, Joe, Linda 3 Anne, John, Mary
#> 2 David, John, Linda 3 Anne, Joe, Mary
#> 3 David, Linda, Mary 3 Anne, Joe, John
#> 4 Joe, John, Linda 3 Anne, David, Mary
#> 5 Joe, Linda, Mary 3 Anne, David, John
#> 6 John, Linda, Mary 3 Anne, David, Joe
#> 7 Cory, Elaine, Joe 2 David, John
#> 8 Cory, Elaine, Linda 2 David, John
#> 9 Cory, Elaine, Mary 2 David, John
#> 10 Cory, Elaine, Victoria 2 David, John
#> # … with 546 more rows
df %>%
count_groups_of(4)
#> # A tibble: 1,026 × 3
#> combs_of_4 n which_name1
#> <chr> <int> <chr>
#> 1 Cory, Elaine, Joe, Linda 2 David, John
#> 2 Cory, Elaine, Joe, Mary 2 David, John
#> 3 Cory, Elaine, Joe, Victoria 2 David, John
#> 4 Cory, Elaine, Linda, Mary 2 David, John
#> 5 Cory, Elaine, Linda, Victoria 2 David, John
#> 6 Cory, Elaine, Mary, Victoria 2 David, John
#> 7 Cory, Joe, Linda, Mary 2 David, John
#> 8 Cory, Joe, Linda, Victoria 2 David, John
#> 9 Cory, Joe, Mary, Victoria 2 David, John
#> 10 Cory, Linda, Mary, Victoria 2 David, John
#> # … with 1,016 more rows
由reprex 包于 2022-01-10 創建(v2.0.1)
uj5u.com熱心網友回復:
這是一種使用dplyr. 例如,這表示 John 和 Mary 共享 3 個名字,而 John 和 David 共享 6 個。
首先,我將所有行與匹配 on 的其他行連接起來name2,然后過濾掉匹配name1(所有將與自己匹配 100%),保持不同的匹配,并計算每個 name1 和另一個之間有多少。
left_join(mutate(df, val = 1),
mutate(df, val = 1), by = c("val", "name2")) %>%
filter(name1.x != name1.y) %>%
distinct(name2, name1.x, name1.y) %>%
count(name1.x, name1.y) %>%
arrange(n)
結果
name1.x name1.y n
1 David Joe 3
2 David Mary 3
3 Joe David 3
4 Joe John 3
5 John Joe 3
6 John Mary 3
7 Mary David 3
8 Mary John 3
9 Anne David 4
10 Anne Joe 4
11 Anne John 4
12 Anne Mary 4
13 David Anne 4
14 Joe Anne 4
15 John Anne 4
16 Mary Anne 4
17 Joe Mary 5
18 Mary Joe 5
19 David John 6
20 John David 6
或將最后一行替換為以下行以生成重合表:
complete(name1.x, name1.y, fill = list(n = 0)) %>% # skip if order doesn't matter
pivot_wider(names_from = name1.y, values_from = n, values_fill = 0)
# A tibble: 5 x 6
name1.x Anne David Joe John Mary
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Anne 0 4 4 4 4
2 David 4 0 3 6 3
3 Joe 4 3 0 3 5
4 John 4 6 3 0 3
5 Mary 4 3 5 3 0
uj5u.com熱心網友回復:
這是一個使用RcppAlgos::comboGeneral.
nm <- sort(unique(df$name2)) ## unique names
M <- matrix(df$name2, 10) ## make a matrix, nrow acc. to name 1 groups
f <- \(n) RcppAlgos::comboGeneral(
nm, n, FUN=\(x) c(x, n=sum(colSums(array(M %in% x, dim=dim(M))) == n))) |>
do.call(what=rbind) |> as.data.frame() |> type.convert(as.is=TRUE) |>
(\(.) .[order(-.$n), ])() |> `rownames<-`(NULL)
給
head(f(2))
# V1 V2 n
# 1 David Linda 4
# 2 Joe Linda 4
# 3 John Linda 4
# 4 Linda Mary 4
# 5 David Joe 3
# 6 David John 3
head(f(3))
# V1 V2 V3 n
# 1 David Joe Linda 3
# 2 David John Linda 3
# 3 David Linda Mary 3
# 4 Joe John Linda 3
# 5 Joe Linda Mary 3
# 6 John Linda Mary 3
head(f(4))
# V1 V2 V3 V4 n
# 1 Cory Elaine Joe Linda 2
# 2 Cory Elaine Joe Mary 2
# 3 Cory Elaine Joe Victoria 2
# 4 Cory Elaine Linda Mary 2
# 5 Cory Elaine Linda Victoria 2
# 6 Cory Elaine Mary Victoria 2
head(f(5))
# V1 V2 V3 V4 V5 n
# 1 Cory Elaine Joe Linda Mary 2
# 2 Cory Elaine Joe Linda Victoria 2
# 3 Cory Elaine Joe Mary Victoria 2
# 4 Cory Elaine Linda Mary Victoria 2
# 5 Cory Joe Linda Mary Victoria 2
# 6 David Holly John Linda Michael 2
head(f(6))
# V1 V2 V3 V4 V5 V6 n
# 1 Cory Elaine Joe Linda Mary Victoria 2
# 2 Andy David Dwight Holly Jim Joe 1
# 3 Andy David Dwight Holly Jim John 1
# 4 Andy David Dwight Holly Jim June 1
# 5 Andy David Dwight Holly Jim Linda 1
# 6 Andy David Dwight Holly Jim Michael 1
注意:使用R >= 4.1。
資料:
df <- structure(list(name1 = c("John", "John", "John", "John", "John",
"John", "John", "John", "John", "John", "Mary", "Mary", "Mary",
"Mary", "Mary", "Mary", "Mary", "Mary", "Mary", "Mary", "Anne",
"Anne", "Anne", "Anne", "Anne", "Anne", "Anne", "Anne", "Anne",
"Anne", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe",
"Joe", "Joe", "David", "David", "David", "David", "David", "David",
"David", "David", "David", "David"), name2 = c("Mary", "Thomas",
"Linda", "David", "Joe", "Carl", "Joel", "Victoria", "Elaine",
"Cory", "June", "John", "Linda", "David", "Joe", "Holly", "Michael",
"Dwight", "Jim", "Andy", "Mary", "John", "Linda", "David", "Joe",
"Helen", "Lauren", "Bill", "Saul", "Ben", "Mary", "John", "Linda",
"David", "Robert", "Holly", "Michael", "James", "Renee", "Sally",
"Mary", "John", "Linda", "Paul", "Joe", "Peter", "Clark", "Elaine",
"Cory", "Victoria")), class = "data.frame", row.names = c(NA,
-50L))
uj5u.com熱心網友回復:
這是一個解決方案,用于intersect()報告每個成對name1值組合的重疊并報告重疊的大小和內容。
library(tidyverse)
name1 <- c("John", "John", "John", "John", "John", "John", "John", "John", "John", "John", "Mary", "Mary", "Mary", "Mary", "Mary", "Mary", "Mary", "Mary", "Mary", "Mary","Anne", "Anne", "Anne", "Anne", "Anne", "Anne", "Anne", "Anne", "Anne", "Anne", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe", "David", "David", "David", "David", "David", "David", "David", "David", "David", "David")
name2 <- c("Mary", "Thomas", "Linda", "David", "Joe", "Carl", "Joel", "Victoria", "Elaine", "Cory", "June", "John", "Linda", "David", "Joe", "Holly", "Michael", "Dwight", "Jim", "Andy", "Mary", "John", "Linda", "David", "Joe", "Helen", "Lauren", "Bill", "Saul", "Ben", "Mary", "John", "Linda", "David", "Robert", "Holly", "Michael", "James", "Renee", "Sally", "Mary", "John", "Linda", "Paul", "Joe", "Peter", "Clark", "Elaine", "Cory", "Victoria")
df <- data.frame(name1, name2)
df %>%
group_by(name1) %>%
group_split() %>%
set_names(unique(name1)) %>%
map(., ~ pull(.x, name2)) %>%
combn(., 2, function(x)
intersect(x[[1]], x[[2]]), simplify = F) %>%
set_names(combn(unique(df$name1), 2, function(x) {
paste(x[[1]], x[[2]], sep = "_")
})) %>%
map_dfc(., ~ toString(.x)) %>%
pivot_longer(everything(), names_to = "combo_names", values_to = "intersection") %>%
separate(combo_names,
into = c("name_a", "name_b"),
sep = "_") %>%
mutate(intersection_size = str_count(intersection, ", ") 1, .after = name_b) %>%
arrange(intersection_size)
#> # A tibble: 10 x 4
#> name_a name_b intersection_size intersection
#> <chr> <chr> <dbl> <chr>
#> 1 Mary Anne 3 Mary, John, Linda
#> 2 Mary David 3 John, Linda, Joe
#> 3 Anne Joe 3 Mary, Linda, David
#> 4 Joe David 3 Linda, David, Joe
#> 5 John Mary 4 Mary, John, Linda, Joe
#> 6 John Anne 4 Mary, John, Linda, David
#> 7 John Joe 4 Mary, Linda, David, Joe
#> 8 John David 4 John, Linda, David, Joe
#> 9 Anne David 5 John, Linda, David, Holly, Michael
#> 10 Mary Joe 6 Mary, Linda, Joe, Elaine, Cory, Victoria
由reprex 包于 2022-01-11 創建(v2.0.1)
轉載請註明出處,本文鏈接:https://www.uj5u.com/yidong/408944.html
標籤:
