parent-child關系資料框如下:
parent_id child_id
1 1 2
2 2 3
3 3 4
目標是實作以下目標,即先前資料框的擴展版本,其中所有后代(子代、孫代等)都分配給每個父代(包括父代/子代本身):
parent_id child_id
1 1 1
2 1 2
3 1 3
4 1 4
5 2 2
6 2 3
7 2 4
8 3 3
9 3 4
10 4 4
我的問題是:在 R 中實作這一目標的最快方法(或其中一種方法)是什么?
我已經嘗試過各種方法——從 for 回圈、SQL 遞回到使用igraph(如此處所述)。它們都相當慢,其中一些在處理大量組合時也容易崩潰。
下面是帶有sqldf和的示例igraph,在比上面稍大的資料幀上進行了基準測驗。
library(sqldf)
library(purrr)
library(dplyr)
library(igraph)
df <- data.frame(parent_id = 1:1000L)
df$child_id <- df$parent_id 1L
# SQL recursion
sqlQuery <- 'with recursive
dfDescendants (parent_id, child_id)
as
(select parent_id, child_id from df
union all
select d.parent_id, s.child_id from dfDescendants d
join df s
on d.child_id = s.parent_id)
select distinct parent_id, parent_id as child_id from dfDescendants
union
select distinct child_id as parent_id, child_id from dfDescendants
union
select * from dfDescendants;'
sqldf(sqlQuery)
# igraph with purrr
df_g = graph_from_data_frame(df, directed = TRUE)
map(V(df_g), ~ names(subcomponent(df_g, .x, mode = "out"))) %>%
map_df(~ data.frame(child_id = .x), .id = "parent_id")
Benchmark (excl. query creation in sqldf and conversion to graph in igraph):
set.seed(23423)
microbenchmark::microbenchmark(
sqldf = sqldf(sqlQuery),
tidyigraph = map(V(df_g), ~ names(subcomponent(df_g, .x, mode = "out"))) %>%
map_df(~ data.frame(child_id = .x), .id = "parent_id"),
times = 5
)
# Unit: seconds
# expr min lq mean median uq max neval
# sqldf 7.815179 8.002836 8.113392 8.084038 8.315207 8.349701 5
# tidyigraph 5.784239 5.806539 5.883241 5.889171 5.964906 5.971350 5
uj5u.com熱心網友回復:
我們可以ego像下面這樣使用
g <- graph_from_data_frame(df)
setNames(
rev(
stack(
Map(
names,
setNames(
ego(g,
order = vcount(g),
mode = "out"
),
names(V(g))
)
)
)
),
names(df)
)
這使
parent_id child_id
1 1 1
2 1 2
3 1 3
4 1 4
5 2 2
6 2 3
7 2 4
8 3 3
9 3 4
10 4 4
基準測驗
set.seed(23423)
microbenchmark::microbenchmark(
sqldf = sqldf(sqlQuery),
tidyigraph = map(V(df_g), ~ names(subcomponent(df_g, .x, mode = "out"))) %>%
map_df(~ data.frame(child_id = .x), .id = "parent_id"),
ego = setNames(
rev(
stack(
Map(
names,
setNames(
ego(df_g,
order = vcount(df_g),
mode = "out"
),
names(V(df_g))
)
)
)
),
names(df)
),
times = 5
)
節目
Unit: milliseconds
expr min lq mean median uq max neval
sqldf 7156.2753 9072.155 9402.6904 9518.2796 10206.3683 11060.3738 5
tidyigraph 2483.9943 2623.558 3136.7490 2689.8388 2879.5688 5006.7853 5
ego 182.5941 219.151 307.2481 253.2171 325.8721 555.4064 5
uj5u.com熱心網友回復:
igraph 當然是回答圖形問題的好方法(另請參閱 Bioconductor 的圖形 RBGL包),但我認為這在 R 中有一個迭代解決方案。
似乎一種合理的方法是執行深度優先的圖遍歷(我期待一個更好的解決方案)。這實際上很容易在 R 中有效實作。假設向量pid并cid描述圖中父節點和子節點之間的鏈接(如問題中的 data.frame 中所示)。將每個節點表示為一個正整數。
all_nodes <- unique(c(parent_id, child_id) # all nodes
uid <- match(all_nodes, all_nodes)
pid <- match(parent_id, all_nodes)
cid <- match(child_id, all_nodes)
并形成從每個節點到其所有子節點的邊串列。
edge_list <- unname(split(cid, factor(pid, levels = uid)))
edge_lengths <- lengths(edge_list)
當前子節點的子節點為edge_list[cid],每個原始父節點關聯的二級子節點數量為rep(pid, edge_lengths[cid])。所以從任何節點到任何其他可達節點的路徑都被簡單迭代遍歷
while (length(pid)) {
pid <- rep(pid, edge_lengths[cid])
cid <- unlist(edge_list[cid])
}
@jblood94 指出遍歷必須跟蹤已經訪問過的邊。我們可以通過創建訪問邊的邏輯向量來有效地實作這一點(在時間上,而不是空間上!)。我們使用“工廠”模式,我們創建一個保留狀態的函式(訪問的節點的邏輯向量)。key該向量由邊緣的唯一 id() 索引pid * n cid。我們對不重復且尚未訪問過的鍵感興趣。
visitor <- function(uid, n_max = 3000) {
n <- length(uid)
if (n <= n_max) {
## over-allocate, to support `key = pid * n cid`
visited <- logical((n 1L) * n) # FALSE on construction
} else {
stop("length(uid) greater than n_max = ", n_max)
}
function(pid, cid) {
key <- pid * n cid
to_visit <- !(duplicated(key) | visited[key])
visited[key[to_visit]] <<- TRUE # update nodes that we will now visit
to_visit
}
}
因此
> visit = visitor(1:10)
> visit(1:3, 2:4)
[1] TRUE TRUE TRUE
> visit(2:4, 3:5)
[1] FALSE FALSE TRUE
這是整個解決方案的更完整實作,帶有額外的簿記
visitor <- function(uid, n_max = 3000) {
n <- length(uid)
if (n <= n_max) {
## over-allocate, to support `key = pid * n cid`
visited <- logical((n 1L) * n) # FALSE on construction
} else {
stop("length(uid) greater than n_max = ", n_max)
}
function(pid, cid) {
key <- pid * n cid
to_visit <- !(duplicated(key) | visited[key])
visited[key[to_visit]] <<- TRUE
to_visit
}
}
ancestor_descendant <- function(df) {
## encode parent and child to unique integer values
ids <- unique(c(df$parent_id, df$child_id))
uid <- match(ids, ids)
pid <- match(df$parent_id, ids)
cid <- match(df$child_id, ids)
n <- length(uid)
## edge list of parent-offspring relationships, based on unique
## integer values; list is ordered by id, all ids are present, ids
## without children have zero-length elements. Use `unname()` so
## that edge_list is always indexed by integer
edge_list <- unname(split(cid, factor(pid, levels = uid), drop = FALSE))
edge_lengths <- lengths(edge_list)
visit <- visitor(uid)
keep <- visit(uid, uid) # all TRUE
aid = did = list(uid) # results -- all uid's are there own ancestor / descendant
i = 1L
while (length(pid)) {
## only add new edges
keep <- visit(pid, cid)
## record current generation ancestors and descendants
pid <- pid[keep]
cid <- cid[keep]
i <- i 1L
aid[[i]] <- pid
did[[i]] <- cid
## calculate next generation pid and cid.
pid <- rep(pid, edge_lengths[cid])
cid <- unlist(edge_list[cid])
}
## decode results to original ids and clean up return value
df <- data.frame(
ancestor_id = ids[unlist(aid)],
descendant_id = ids[unlist(did)]
)
df <- df[order(df$ancestor_id, df$descendant_id),]
rownames(df) <- NULL
df
}
This seems to be correct and performant, at least superficially
## Original example
df <- data.frame(parent_id = 1:1000L)
df$child_id <- df$parent_id 1L
df = df[sample(nrow(df)),]
system.time(result <- ancestor_descendant(df))
## user system elapsed
## 0.243 0.001 0.245
dim(result)
## [1] 501501 2
## updated example from comments
df <- data.frame(parent_id = 1:1000L)
df$child_id <- df$parent_id 1L
df <- rbind(df, data.frame(parent_id = 1000L, child_id = 1002L))
system.time(result <- ancestor_descendant(df))
## user system elapsed
## 0.195 0.001 0.195
dim(result)
## [1] 502502 2
## problematic case from @jblood94
df <- data.frame(
parent_id=c(1, 1, 2),
child_id = c(2, 3, 3)
)
ancestor_descendant(df)
## ancestor_id descendant_id
## 1 1 1
## 2 1 2
## 3 1 3
## 4 2 2
## 5 2 3
## 6 3 3
## previously failed without filtering re-visited nodes
df <- data.frame(
parent_id = rep(1:100, each = 2),
child_id = c(2, rep(3:101, each = 2), 102)
)
system.time(result <- ancestor_descendant(df))
## user system elapsed
## 0.005 0.000 0.006
dim(result)
## [1] 5252 2
轉載請註明出處,本文鏈接:https://www.uj5u.com/qiye/426863.html
標籤:r performance recursion graph-theory igraph
