我需要對資料從長到寬執行簡單的重塑,這需要在基礎 R 中作業。對于這個用例,reshape() 似乎非常慢(盡管斷言它非常快https://stackoverflow。 com/a/12073077/3017280)。這個例子是我的資料的合理近似。我知道在這個例子中我不需要兩個索引列,但我在真實資料中需要。在我的筆記本電腦上,10,000 行需要 3 秒,40,000 行需要 200 多秒。真實資料有超過一百萬行,所以 reshape() 顯然是一個非初學者。任何人都可以解釋為什么在這種情況下需要這么長時間?我使用 split/lapply/Reduce merge 解決了這個問題,這很笨拙但速度更快。
n <- 5000
dfLong <- data.frame(Index1 = rep(sample(1E6:2E6, n), 4),
Index2 = rep(sample(3E6:4E6, n), 4),
Key = rep(1:4, each = n),
Date = sample(seq.Date(as.Date("2020-01-01"),
as.Date("2021-12-31"),
by = "1 day"),
size = n * 4, replace = TRUE),
Score = sample(0:48, n * 4, replace = TRUE))
system.time(dfWide <- reshape(data = dfLong,
v.names = c("Date", "Score"),
timevar = "Key",
idvar = c("Index1", "Index2"),
sep = "_Q",
direction = "wide"))
uj5u.com熱心網友回復:
如果您查看reshape使用 profvis 包呼叫的函式,您會發現幾乎所有花費的總時間都在函式中的這一行上。該interaction函式僅用于將您的兩個 id 列合并為一個列。
data[, tempidname] <- interaction(data[, idvar],
drop = TRUE)
而不是interaction,您可以使用do.call(paste0, data[, idvar])。你可以使用一個函式來創建一個與interaction這個更快的函式相等的環境。
new_reshape <- function(...){
interaction <- function(x, drop) do.call(paste0, x)
environment(reshape) <- environment()
reshape(...)
}
現在快多了
system.time(dfWide <- reshape(data = dfLong,
v.names = c("Date", "Score"),
timevar = "Key",
idvar = c("Index1", "Index2"),
sep = "_Q",
direction = "wide"))
# user system elapsed
# 35.292 0.538 36.236
system.time(new_dfWide <- new_reshape(data = dfLong,
v.names = c("Date", "Score"),
timevar = "Key",
idvar = c("Index1", "Index2"),
sep = "_Q",
direction = "wide"))
# user system elapsed
# 0.071 0.009 0.081
all.equal(new_dfWide, dfWide)
# [1] TRUE
使用plyr:::ninteraction. 這個函式唯一的非基礎依賴是plyr:::id_var,它沒有依賴,這意味著如果你不能安裝包,你可以很容易地復制粘貼這個函式定義(添加注釋以表示信用)。
new_reshape <- function(...){
# interaction = plyr:::ninteraction
# id_var = plyr:::id_var
interaction <-
function (.variables, drop = FALSE)
{
lengths <- vapply(.variables, length, integer(1))
.variables <- .variables[lengths != 0]
if (length(.variables) == 0) {
n <- nrow(.variables) %||% 0L
return(structure(seq_len(n), n = n))
}
if (length(.variables) == 1) {
return(id_var(.variables[[1]], drop = drop))
}
ids <- rev(lapply(.variables, id_var, drop = drop))
p <- length(ids)
ndistinct <- vapply(ids, attr, "n", FUN.VALUE = numeric(1),
USE.NAMES = FALSE)
n <- prod(ndistinct)
if (n > 2^31) {
char_id <- do.call("paste", c(ids, sep = "\r"))
res <- match(char_id, unique(char_id))
}
else {
combs <- c(1, cumprod(ndistinct[-p]))
mat <- do.call("cbind", ids)
res <- c((mat - 1L) %*% combs 1L)
}
attr(res, "n") <- n
if (drop) {
id_var(res, drop = TRUE)
}
else {
structure(as.integer(res), n = attr(res, "n"))
}
}
id_var <-
function (x, drop = FALSE)
{
if (length(x) == 0)
return(structure(integer(), n = 0L))
if (!is.null(attr(x, "n")) && !drop)
return(x)
if (is.factor(x) && !drop) {
x <- addNA(x, ifany = TRUE)
id <- as.integer(x)
n <- length(levels(x))
}
else {
levels <- sort(unique(x), na.last = TRUE)
id <- match(x, levels)
n <- max(id)
}
structure(id, n = n)
}
environment(reshape) <- environment()
reshape(...)
}
system.time(new_dfWide <- new_reshape(data = dfLong,
v.names = c("Date", "Score"),
timevar = "Key",
idvar = c("Index1", "Index2"),
sep = "_Q",
direction = "wide"))
# user system elapsed
# 0.015 0.000 0.015
uj5u.com熱心網友回復:
我不知道我曾經聲稱stats::reshape是最快的。
為了進行比較,stats::reshape在我的 i9/64GB-ram 系統上沒有那么快:
system.time(
dfWide <- reshape(data = dfLong,
v.names = c("Date", "Score"),
timevar = "Key",
idvar = c("Index1", "Index2"),
sep = "_Q",
direction = "wide")
)
# user system elapsed
# 19.63 0.03 19.73
但其他重塑功能做得更好:
system.time(
tidyrWide <- pivot_wider(
dfLong, c("Index1", "Index2"),
names_prefix = "Q", names_from = "Key",
values_from = c("Date", "Score"))
)
# user system elapsed
# 0.01 0.00 0.02
nms <- names(dfWide)
tidyrWide <- subset(tidyrWide, select = nms) # column order
dfOrder <- do.call(order, dfWide)
tidyrOrder <- do.call(order, tidyrWide)
all.equal(dfWide[dfOrder,], as.data.frame(tidyrWide)[tidyrOrder,], check.attributes = FALSE)
# [1] TRUE
同樣,data.table::dcast同樣快:
dtLong <- as.data.table(dfLong)
system.time(
dtWide <- data.table::dcast(
Index1 Index2 ~ paste0("Q", Key),
data = dtLong, value.var = c("Date", "Score"))
)
# user system elapsed
# 0.00 0.01 0.02
dtWide <- subset(dtWide, select = nms) # column order
dtOrder <- do.call(order, dtWide)
all.equal(dfWide[dfOrder,nms], as.data.frame(dtWide)[dtOrder,nms], check.attributes = FALSE)
# [1] TRUE
uj5u.com熱心網友回復:
考慮使用基礎 R 的@Moody_Mudskipper 的matrix_spread的高級修改版本。由于matrix將簡化復雜型別,如Date,將需要一些臨時更改:
功能
matrix_spread <- function(df1, id, key, value, sep){
unique_ids <- unique(df1[[key]])
mats <- lapply(df1[value], function(x)
matrix(x, ncol=length(unique_ids), byrow = FALSE)
)
df2 <- do.call(
data.frame, list(unique(df1[id]), mats)
)
# RENAME COLS
names(df2)[(length(id) 1):ncol(df2)] <- as.vector(
sapply(value, function(x, y) paste0(x, sep, y), unique_ids)
)
# REORDER COLS
df2 <- df2[c(id, as.vector(
outer(c(value), unique_ids, function(x, y) paste0(x, sep, y))
))]
return(df2)
}
應用
system.time(
dfWide2 <- matrix_spread(
df1 = dfLong,
id = c("Index1", "Index2"),
key = "Key",
value = c("Date", "Score"),
sep = "_Q"
)
)
# user system elapsed
# 0.022 0.000 0.023
# CONVERT INTEGERS TO DATES
dfWide2[grep("Date", names(dfWide2))] <- lapply(
dfWide2[grep("Date", names(dfWide2))],
as.Date,
origin = "1970-01-01"
)
# REPLICATES OP'S reshape
identical(data.frame(dfWide), dfWide2)
# [1] TRUE
轉載請註明出處,本文鏈接:https://www.uj5u.com/net/403215.html
標籤:
