R-計算組內所有觀察組合之間的差異-有解無憂

我想比較不同施肥劑量對不同地點的多種作物品種的影響。我的資料集類似于下面生成的資料集：

locs <- rep(c("loc1","loc2","loc3"), length.out=180)
cults <- rep(c("cult1","cult2","cult3","cult4","cult5"), length.out=180)
doses <- rep(c("no_fert","40kg","50kg","60kg"), length.out=180)
set.seed(123); yld <- runif(3*length(unique(locs))*length(unique(cults))*length(unique(doses)), min=3, max=15)

dat <- data.frame(location=locs,
                  cultivar=cults,
                  fert_dose=doses,
                  yield=yld)

請注意，每種肥料劑量有 3 次重復（但在我的實際資料集中還有更多）。

我需要做的第一件事是計算每個位置-品種-肥料組合的三次重復的平均值。

我可以做到——以一種可能不太有效的方式——像這樣：

d1 <- d2 <- d3 <- list()
for (i in 1:length(unique(locs))){
  for (j in 1:length(unique(cults))){
    for (k in 1:length(unique(doses))){
      d1[[k]] <- data.frame(location=locs[i],
                            cultivar=cults[j],
                            fert_dose=doses[k],
                            mean_yield=mean(dat[dat$location==locs[i]&dat$cultivar==cults[j]&dat$fert_dose==doses[k],]$yield))
    }
    d2[[j]] <- do.call(rbind,d1)
  }
  d3[[i]] <- do.call(rbind,d2)
}

(mean_dat <- do.call(rbind, d3))

接下來，我需要做的是：對于每個位置，找出所有品種和肥料劑量組合之間的產量差異。

例如，僅考慮loc1和cult1，預期結果將是：

res <- "
location cultivar dose dose_mean other_cultivar other_dose other_mean diff
loc1 cult1 no_fert 9.402345 cult1 40kg 9.251377 0.150968
loc1 cult1 no_fert 9.402345 cult1 50kg 10.764692 -1.362347
loc1 cult1 no_fert 9.402345 cult1 60kg 10.119129 -0.716784

loc1 cult1 40kg 9.251377 cult1 no_fert 9.402345 -0.150968
loc1 cult1 40kg 9.251377 cult1 50kg 10.764692 -1.513315
loc1 cult1 40kg 9.251377 cult1 60kg 10.119129 -0.867752

loc1 cult1 50kg 10.764692 cult1 no_fert 9.402345 1.362347
loc1 cult1 50kg 10.764692 cult1 40kg 9.251377 1.513315
loc1 cult1 50kg 10.764692 cult1 60kg 10.119129 0.645563

loc1 cult1 60kg 10.119129 cult1 no_fert 9.402345 0.716784
loc1 cult1 60kg 10.119129 cult1 40kg 9.251377 0.867752
loc1 cult1 60kg 10.119129 cult1 50kg 10.764692 -0.645563
"
(res <- read.table(textConnection(res), sep=" ", header=T, stringsAsFactors=F))

在此表中，我重復上一步（mean_dat表）中獲得的每個劑量的屈服值，并計算它們之間的簡單差異。結果表將繼續此分析，包括other_cultivar列中的其他品種。

我認為預期的表格看起來不太好，但它將用于提供互動式儀表板，這是它需要的格式，所以我認為我在這里沒有太多選擇。

是否有任何程式化方式可以一步實作這兩個結果？

uj5u.com熱心網友回復：

dplyr

library(dplyr)
dat %>%
  group_by(location, cultivar, dose = fert_dose) %>%
  summarize(dose_mean = mean(yield), .groups = "drop") %>%
  full_join(., ., by = "location", suffix = c("", "_other")) %>%
  filter(cultivar != cultivar_other | dose != dose_other) %>%
  mutate(diff = dose_mean - dose_mean_other)
# # A tibble: 1,140 x 8
#    location cultivar dose  dose_mean cultivar_other dose_other dose_mean_other     diff
#    <chr>    <chr>    <chr>     <dbl> <chr>          <chr>                <dbl>    <dbl>
#  1 loc1     cult1    40kg       9.25 cult1          50kg                 10.8  -1.51   
#  2 loc1     cult1    40kg       9.25 cult1          60kg                 10.1  -0.868  
#  3 loc1     cult1    40kg       9.25 cult1          no_fert               9.40 -0.151  
#  4 loc1     cult1    40kg       9.25 cult2          40kg                 10.1  -0.830  
#  5 loc1     cult1    40kg       9.25 cult2          50kg                  8.97  0.282  
#  6 loc1     cult1    40kg       9.25 cult2          60kg                  6.71  2.54   
#  7 loc1     cult1    40kg       9.25 cult2          no_fert              11.5  -2.20   
#  8 loc1     cult1    40kg       9.25 cult3          40kg                 11.9  -2.70   
#  9 loc1     cult1    40kg       9.25 cult3          50kg                  9.21  0.0421 
# 10 loc1     cult1    40kg       9.25 cult3          60kg                  9.26 -0.00416
# # ... with 1,130 more rows

cultivar請注意，這是對and進行外部連接dose。我們從 180 行開始，到 1140 行結束，這將呈幾何級數增長。

資料表

library(data.table)
DT <- as.data.table(dat)[, .(dose_mean = mean(yield)), by = .(location, cultivar, dose = fert_dose)]
merge(DT, DT, by = "location", all = TRUE, suffix = c("", "_other"), allow.cartesian = TRUE
  )[(cultivar != cultivar_other | dose != dose_other),
  ][, diff := dose_mean - dose_mean_other][]
#       location cultivar    dose dose_mean cultivar_other dose_other dose_mean_other         diff
#         <char>   <char>  <char>     <num>         <char>     <char>           <num>        <num>
#    1:     loc1    cult1 no_fert  9.402345          cult4       60kg        8.508675  0.893670057
#    2:     loc1    cult1 no_fert  9.402345          cult2       50kg        8.969489  0.432856209
#    3:     loc1    cult1 no_fert  9.402345          cult5       40kg        9.345814  0.056530679
#    4:     loc1    cult1 no_fert  9.402345          cult3    no_fert       11.243009 -1.840663741
#    5:     loc1    cult1 no_fert  9.402345          cult1       60kg       10.119129 -0.716784445
#    6:     loc1    cult1 no_fert  9.402345          cult4       50kg        9.638162 -0.235817407
#    7:     loc1    cult1 no_fert  9.402345          cult2       40kg       10.081336 -0.678991009
#    8:     loc1    cult1 no_fert  9.402345          cult5    no_fert        9.405199 -0.002854273
#    9:     loc1    cult1 no_fert  9.402345          cult3       60kg        9.255537  0.146807576
#   10:     loc1    cult1 no_fert  9.402345          cult1       50kg       10.764692 -1.362347580
#   ---                                                                                           
# 1131:     loc3    cult5    60kg  8.442893          cult5       40kg        7.217206  1.225686617
# 1132:     loc3    cult5    60kg  8.442893          cult3    no_fert        8.688523 -0.245630492
# 1133:     loc3    cult5    60kg  8.442893          cult1       60kg        7.221926  1.220966527
# 1134:     loc3    cult5    60kg  8.442893          cult4       50kg        7.918912  0.523980425
# 1135:     loc3    cult5    60kg  8.442893          cult2       40kg        7.405098  1.037794838
# 1136:     loc3    cult5    60kg  8.442893          cult5    no_fert        6.963170  1.479722527
# 1137:     loc3    cult5    60kg  8.442893          cult3       60kg        8.183201  0.259691148
# 1138:     loc3    cult5    60kg  8.442893          cult1       50kg        9.444416 -1.001523464
# 1139:     loc3    cult5    60kg  8.442893          cult4       40kg       10.264777 -1.821884187
# 1140:     loc3    cult5    60kg  8.442893          cult2    no_fert        7.196217  1.246675164

請注意，這樣做data.table效果很好，但并沒有真正減少就地計算的記憶體占用或通常歸因于data.table基于 - 的解決方案的速度。

uj5u.com熱心網友回復：

使用data.table，您可以執行以下連接：

library(data.table)

locs <- rep(c("loc1","loc2","loc3"), length.out=180)
cults <- rep(c("cult1","cult2","cult3","cult4","cult5"), length.out=180)
doses <- rep(c("no_fert","40kg","50kg","60kg"), length.out=180)
set.seed(123); yld <- runif(3*length(unique(locs))*length(unique(cults))*length(unique(doses)), min=3, max=15)

dat <- data.frame(location=locs,
                  cultivar=cults,
                  fert_dose=doses,
                  yield=yld)


setDT(dat)

dat[dat, .(cultivar_1 = cultivar, 
           cultivar_2 = i.cultivar,
           fert_dose_1 = fert_dose,
           fert_dose_2 = i.fert_dose,
           yield_1 = yield,
           yield_2 = i.yield,
           diff = yield - i.yield), on = "location", by = .EACHI][
           !(cultivar_1 == cultivar_2 & fert_dose_1 == fert_dose_2)][
             order(location, cultivar_1,fert_dose_1, cultivar_2, fert_dose_2)]

#>        location cultivar_1 cultivar_2 fert_dose_1 fert_dose_2   yield_1
#>     1:     loc1      cult1      cult1        40kg        50kg  4.665673
#>     2:     loc1      cult1      cult1        40kg        50kg 13.684203
#>     3:     loc1      cult1      cult1        40kg        50kg  9.404255
#>     4:     loc1      cult1      cult1        40kg        50kg  4.665673
#>     5:     loc1      cult1      cult1        40kg        50kg 13.684203
#>    ---                                                                 
#> 10256:     loc3      cult5      cult5     no_fert        60kg  8.794829
#> 10257:     loc3      cult5      cult5     no_fert        60kg  7.265345
#> 10258:     loc3      cult5      cult5     no_fert        60kg  4.829337
#> 10259:     loc3      cult5      cult5     no_fert        60kg  8.794829
#> 10260:     loc3      cult5      cult5     no_fert        60kg  7.265345
#>          yield_2        diff
#>     1: 14.556291 -9.89061803
#>     2: 14.556291 -0.87208812
#>     3: 14.556291 -5.15203544
#>     4:  4.568348  0.09732446
#>     5:  4.568348  9.11585437
#>    ---                      
#> 10256:  7.854123  0.94070539
#> 10257:  7.854123 -0.58877881
#> 10258:  9.981001 -5.15166422
#> 10259:  9.981001 -1.18617243
#> 10260:  9.981001 -2.71565663

^{使用reprex v2.0.2創建于 2022-10-26}

uj5u.com熱心網友回復：

一個整潔的 dplyr 解決方案，將每個位置的所有值添加到一個新列，然后過濾以洗掉幾個相同的組合：

library(tidyverse)

myfunc <- function(df) {
  df %>%
    add_column(other = list(.)) %>%
    unnest(other, names_sep = "_") %>%
    filter(!(cultivar == other_cultivar & fert_dose == other_fert_dose)) %>%
    mutate(diff = yield - other_yield)
}

datmeans <- dat %>%
  group_by(location, cultivar, fert_dose) %>%
  summarise(yield = mean(yield), .groups = "drop") %>%
  group_split(location) %>%
  map(myfunc) %>%
  bind_rows()

uj5u.com熱心網友回復：

一種data.table避免更大連接和后續過濾的解決方案。這將是快速且節省記憶體的。

dat_mean <- setDT(dat)[,.(mean_yield = mean(yield)), location:fert_dose][, doseIdx := match(fert_dose, unique(fert_dose))]

rbindlist(
  lapply(
    parse(
      text = c(
        ".(location, doseIdx > doseIdx)",
        ".(location, doseIdx < doseIdx)"
      )
    ),
    function(e) {
      dat_mean[
        dat_mean,
        .(
          location,
          cultivar1 = cultivar,
          fert_dose1 = fert_dose,
          yield1 = mean_yield,
          cultivar2 = i.cultivar,
          fert_dose2 = i.fert_dose,
          yield2 = i.mean_yield,
          diff = mean_yield - i.mean_yield
        ),
        on = eval(e)
      ]
    }
  )
)
#>      location cultivar1 fert_dose1    yield1 cultivar2 fert_dose2   yield2        diff
#>   1:     loc1     cult4       60kg  8.508675     cult1    no_fert 9.402345 -0.89367006
#>   2:     loc1     cult2       50kg  8.969489     cult1    no_fert 9.402345 -0.43285621
#>   3:     loc1     cult5       40kg  9.345814     cult1    no_fert 9.402345 -0.05653068
#>   4:     loc1     cult1       60kg 10.119129     cult1    no_fert 9.402345  0.71678444
#>   5:     loc1     cult4       50kg  9.638162     cult1    no_fert 9.402345  0.23581741
#>  ---                                                                                  
#> 926:     loc3     cult2       40kg  7.405098     cult5       60kg 8.442893 -1.03779484
#> 927:     loc3     cult5    no_fert  6.963170     cult5       60kg 8.442893 -1.47972253
#> 928:     loc3     cult1       50kg  9.444416     cult5       60kg 8.442893  1.00152346
#> 929:     loc3     cult4       40kg 10.264777     cult5       60kg 8.442893  1.82188419
#> 930:     loc3     cult2    no_fert  7.196217     cult5       60kg 8.442893 -1.24667516

轉載請註明出處，本文鏈接：https://www.uj5u.com/gongcheng/520916.html

標籤：rdplyr数据表

上一篇：在ggplot的圖例中添加背景透明度和自定義排序

下一篇：從出生日期計算年齡時處理NA值