Rreshape()非常慢-有解無憂

我需要對資料從長到寬執行簡單的重塑，這需要在基礎 R 中作業。對于這個用例，reshape() 似乎非常慢（盡管斷言它非常快https://stackoverflow。 com/a/12073077/3017280）。這個例子是我的資料的合理近似。我知道在這個例子中我不需要兩個索引列，但我在真實資料中需要。在我的筆記本電腦上，10,000 行需要 3 秒，40,000 行需要 200 多秒。真實資料有超過一百萬行，所以 reshape() 顯然是一個非初學者。任何人都可以解釋為什么在這種情況下需要這么長時間？我使用 split/lapply/Reduce merge 解決了這個問題，這很笨拙但速度更快。

n <- 5000
dfLong <- data.frame(Index1 = rep(sample(1E6:2E6, n), 4),
                  Index2 = rep(sample(3E6:4E6, n), 4),
                  Key = rep(1:4, each = n),
                  Date = sample(seq.Date(as.Date("2020-01-01"),
                                         as.Date("2021-12-31"), 
                                         by = "1 day"),
                                size = n * 4, replace = TRUE),
                  Score = sample(0:48, n * 4, replace = TRUE))
                                
system.time(dfWide <- reshape(data = dfLong,
          v.names = c("Date", "Score"),
          timevar = "Key",
          idvar = c("Index1", "Index2"),
          sep = "_Q",
          direction = "wide"))

uj5u.com熱心網友回復：

如果您查看reshape使用 profvis 包呼叫的函式，您會發現幾乎所有花費的總時間都在函式中的這一行上。該interaction函式僅用于將您的兩個 id 列合并為一個列。

data[, tempidname] <- interaction(data[, idvar], 
                drop = TRUE)

而不是interaction，您可以使用do.call(paste0, data[, idvar])。你可以使用一個函式來創建一個與interaction這個更快的函式相等的環境。

new_reshape <- function(...){
  interaction <- function(x, drop) do.call(paste0, x)
  environment(reshape) <- environment()
  reshape(...)
}

現在快多了

system.time(dfWide <- reshape(data = dfLong,
          v.names = c("Date", "Score"),
          timevar = "Key",
          idvar = c("Index1", "Index2"),
          sep = "_Q",
          direction = "wide"))
 #   user  system elapsed 
 # 35.292   0.538  36.236 

system.time(new_dfWide <- new_reshape(data = dfLong,
          v.names = c("Date", "Score"),
          timevar = "Key",
          idvar = c("Index1", "Index2"),
          sep = "_Q",
          direction = "wide"))

  # user  system elapsed 
  # 0.071   0.009   0.081  

all.equal(new_dfWide, dfWide)
# [1] TRUE

使用plyr:::ninteraction. 這個函式唯一的非基礎依賴是plyr:::id_var，它沒有依賴，這意味著如果你不能安裝包，你可以很容易地復制粘貼這個函式定義（添加注釋以表示信用）。

new_reshape <- function(...){
  # interaction = plyr:::ninteraction
  # id_var = plyr:::id_var
  interaction <- 
    function (.variables, drop = FALSE) 
    {
        lengths <- vapply(.variables, length, integer(1))
        .variables <- .variables[lengths != 0]
        if (length(.variables) == 0) {
            n <- nrow(.variables) %||% 0L
            return(structure(seq_len(n), n = n))
        }
        if (length(.variables) == 1) {
            return(id_var(.variables[[1]], drop = drop))
        }
        ids <- rev(lapply(.variables, id_var, drop = drop))
        p <- length(ids)
        ndistinct <- vapply(ids, attr, "n", FUN.VALUE = numeric(1), 
            USE.NAMES = FALSE)
        n <- prod(ndistinct)
        if (n > 2^31) {
            char_id <- do.call("paste", c(ids, sep = "\r"))
            res <- match(char_id, unique(char_id))
        }
        else {
            combs <- c(1, cumprod(ndistinct[-p]))
            mat <- do.call("cbind", ids)
            res <- c((mat - 1L) %*% combs   1L)
        }
        attr(res, "n") <- n
        if (drop) {
            id_var(res, drop = TRUE)
        }
        else {
            structure(as.integer(res), n = attr(res, "n"))
        }
    }  
  id_var <- 
  function (x, drop = FALSE) 
  {
      if (length(x) == 0) 
          return(structure(integer(), n = 0L))
      if (!is.null(attr(x, "n")) && !drop) 
          return(x)
      if (is.factor(x) && !drop) {
          x <- addNA(x, ifany = TRUE)
          id <- as.integer(x)
          n <- length(levels(x))
      }
      else {
          levels <- sort(unique(x), na.last = TRUE)
          id <- match(x, levels)
          n <- max(id)
      }
      structure(id, n = n)
  }
  environment(reshape) <- environment()
  reshape(...)
}

system.time(new_dfWide <- new_reshape(data = dfLong,
          v.names = c("Date", "Score"),
          timevar = "Key",
          idvar = c("Index1", "Index2"),
          sep = "_Q",
          direction = "wide"))

  #  user  system elapsed 
  # 0.015   0.000   0.015

uj5u.com熱心網友回復：

我不知道我曾經聲稱stats::reshape是最快的。

為了進行比較，stats::reshape在我的 i9/64GB-ram 系統上沒有那么快：

system.time(
dfWide <- reshape(data = dfLong,
          v.names = c("Date", "Score"),
          timevar = "Key",
          idvar = c("Index1", "Index2"),
          sep = "_Q",
          direction = "wide")
)
#    user  system elapsed 
#   19.63    0.03   19.73

但其他重塑功能做得更好：

system.time(
  tidyrWide <- pivot_wider(
    dfLong, c("Index1", "Index2"),
    names_prefix = "Q", names_from = "Key",
    values_from = c("Date", "Score"))
)
#    user  system elapsed 
#    0.01    0.00    0.02 

nms <- names(dfWide)
tidyrWide <- subset(tidyrWide, select = nms) # column order
dfOrder <- do.call(order, dfWide)
tidyrOrder <- do.call(order, tidyrWide)
all.equal(dfWide[dfOrder,], as.data.frame(tidyrWide)[tidyrOrder,], check.attributes = FALSE)
# [1] TRUE

同樣，data.table::dcast同樣快：

dtLong <- as.data.table(dfLong)
system.time(
  dtWide <- data.table::dcast(
    Index1   Index2 ~ paste0("Q", Key),
    data = dtLong, value.var = c("Date", "Score"))
)
#    user  system elapsed 
#    0.00    0.01    0.02 

dtWide <- subset(dtWide, select = nms) # column order
dtOrder <- do.call(order, dtWide)
all.equal(dfWide[dfOrder,nms], as.data.frame(dtWide)[dtOrder,nms], check.attributes = FALSE)
# [1] TRUE

uj5u.com熱心網友回復：

考慮使用基礎 R 的@Moody_Mudskipper 的matrix_spread的高級修改版本。由于matrix將簡化復雜型別，如Date，將需要一些臨時更改：

功能

matrix_spread <- function(df1, id, key, value, sep){
  unique_ids <-  unique(df1[[key]])
  mats <- lapply(df1[value], function(x) 
    matrix(x, ncol=length(unique_ids), byrow = FALSE)
  )
  df2 <- do.call(
    data.frame, list(unique(df1[id]), mats)
  )
  
  # RENAME COLS
  names(df2)[(length(id) 1):ncol(df2)] <- as.vector(
    sapply(value, function(x, y) paste0(x, sep, y), unique_ids)
  )
  # REORDER COLS
  df2 <- df2[c(id, as.vector(
    outer(c(value), unique_ids, function(x, y) paste0(x, sep, y))
  ))]
  
  return(df2)
}

應用

system.time(
  dfWide2 <- matrix_spread(
    df1 = dfLong, 
    id = c("Index1", "Index2"),
    key = "Key",
    value = c("Date", "Score"),
    sep = "_Q"
  )
)
#  user  system elapsed 
# 0.022   0.000   0.023 

# CONVERT INTEGERS TO DATES
dfWide2[grep("Date", names(dfWide2))] <- lapply(
  dfWide2[grep("Date", names(dfWide2))],
  as.Date,
  origin = "1970-01-01"
)

# REPLICATES OP'S reshape
identical(data.frame(dfWide), dfWide2)
# [1] TRUE

轉載請註明出處，本文鏈接：https://www.uj5u.com/net/403215.html

標籤：

上一篇：如何使用另一個資料框填充一個資料框，同時保留第一個資料框的NA

下一篇：使用ggplot2定義包的點大小的最佳方法是什么