我有以下名為 df 的示例資料框(dput如下):
group date indicator
1 A 2022-11-01 01:00:00 FALSE
2 A 2022-11-01 03:00:00 FALSE
3 A 2022-11-01 04:00:00 TRUE
4 A 2022-11-01 05:00:00 FALSE
5 A 2022-11-01 06:00:00 TRUE
6 A 2022-11-01 07:00:00 FALSE
7 A 2022-11-01 10:00:00 FALSE
8 A 2022-11-01 12:00:00 FALSE
9 B 2022-11-01 01:00:00 FALSE
10 B 2022-11-01 02:00:00 FALSE
11 B 2022-11-01 03:00:00 FALSE
12 B 2022-11-01 06:00:00 TRUE
13 B 2022-11-01 07:00:00 FALSE
14 B 2022-11-01 08:00:00 FALSE
15 B 2022-11-01 11:00:00 TRUE
16 B 2022-11-01 13:00:00 FALSE
我想計算日期與indicator == TRUE每組最近的條件行之間的小時差。此外,具有 TRUE 的行應回傳 0 作為輸出。在這里您可以看到名為 df_desired 的所需輸出:
group date indicator diff_hours
1 A 2022-11-01 01:00:00 FALSE 3
2 A 2022-11-01 03:00:00 FALSE 1
3 A 2022-11-01 04:00:00 TRUE 0
4 A 2022-11-01 05:00:00 FALSE 1
5 A 2022-11-01 06:00:00 TRUE 0
6 A 2022-11-01 07:00:00 FALSE 1
7 A 2022-11-01 10:00:00 FALSE 4
8 A 2022-11-01 12:00:00 FALSE 6
9 B 2022-11-01 01:00:00 FALSE 5
10 B 2022-11-01 02:00:00 FALSE 4
11 B 2022-11-01 03:00:00 FALSE 3
12 B 2022-11-01 06:00:00 TRUE 0
13 B 2022-11-01 07:00:00 FALSE 1
14 B 2022-11-01 08:00:00 FALSE 2
15 B 2022-11-01 11:00:00 TRUE 0
16 B 2022-11-01 13:00:00 FALSE 2
所以我想知道是否有人知道如何計算每組最近的條件行的日期之間的小時差?
這里dput的 df 和 df_desired:
df <- structure(list(group = c("A", "A", "A", "A", "A", "A", "A", "A",
"B", "B", "B", "B", "B", "B", "B", "B"), date = structure(c(1667260800,
1667268000, 1667271600, 1667275200, 1667278800, 1667282400, 1667293200,
1667300400, 1667260800, 1667264400, 1667268000, 1667278800, 1667282400,
1667286000, 1667296800, 1667304000), class = c("POSIXct", "POSIXt"
), tzone = ""), indicator = c(FALSE, FALSE, TRUE, FALSE, TRUE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE,
TRUE, FALSE)), class = "data.frame", row.names = c(NA, -16L))
df_desired <- structure(list(group = c("A", "A", "A", "A", "A", "A", "A", "A",
"B", "B", "B", "B", "B", "B", "B", "B"), date = structure(c(1667260800,
1667268000, 1667271600, 1667275200, 1667278800, 1667282400, 1667293200,
1667300400, 1667260800, 1667264400, 1667268000, 1667278800, 1667282400,
1667286000, 1667296800, 1667304000), class = c("POSIXct", "POSIXt"
), tzone = ""), indicator = c(FALSE, FALSE, TRUE, FALSE, TRUE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE,
TRUE, FALSE), diff_hours = c(3, 1, 0, 1, 0, 1, 4, 6, 5, 4, 3,
0, 1, 2, 0, 2)), class = "data.frame", row.names = c(NA, -16L
))
uj5u.com熱心網友回復:
與map_dbl:
library(dplyr)
library(purrr)
df %>%
group_by(group) %>%
mutate(diff_hours = map_dbl(date, ~ min(abs(.x - date[indicator]))))
輸出
# A tibble: 16 × 4
# Groups: group [2]
group date indicator diff_hours
<chr> <dttm> <lgl> <dbl>
1 A 2022-11-01 01:00:00 FALSE 3
2 A 2022-11-01 03:00:00 FALSE 1
3 A 2022-11-01 04:00:00 TRUE 0
4 A 2022-11-01 05:00:00 FALSE 1
5 A 2022-11-01 06:00:00 TRUE 0
6 A 2022-11-01 07:00:00 FALSE 1
7 A 2022-11-01 10:00:00 FALSE 4
8 A 2022-11-01 12:00:00 FALSE 6
9 B 2022-11-01 01:00:00 FALSE 5
10 B 2022-11-01 02:00:00 FALSE 4
11 B 2022-11-01 03:00:00 FALSE 3
12 B 2022-11-01 06:00:00 TRUE 0
13 B 2022-11-01 07:00:00 FALSE 1
14 B 2022-11-01 08:00:00 FALSE 2
15 B 2022-11-01 11:00:00 TRUE 0
16 B 2022-11-01 13:00:00 FALSE 2
如果想保持原來的差異(不是絕對的):
df %>%
group_by(group) %>%
mutate(diff_hours = map_dbl(date, ~ (.x - date[indicator])[which.min(abs(.x - date[indicator]))]))
uj5u.com熱心網友回復:
你可以data.table像下面這樣嘗試(應該有比我更有效的選項)
- 使用
findInterval或roll = "nearest"
setDT(df)[
,
diff_hours := abs(
difftime(date,
date[indicator][pmax(1, findInterval(date, date[indicator]))],
units = "hours"
)
),
group
][]
或者
setDT(df)[
,
diffhours := abs(
difftime(date,
.SD[indicator][.SD,
date,
by = group,
on = "date",
roll = "nearest",
mult = "first"
][, date],
units = "hours"
)
)
][]
這使
group date indicator diff_hours
1: A 2022-11-01 01:00:00 FALSE 3 hours
2: A 2022-11-01 03:00:00 FALSE 1 hours
3: A 2022-11-01 04:00:00 TRUE 0 hours
4: A 2022-11-01 05:00:00 FALSE 1 hours
5: A 2022-11-01 06:00:00 TRUE 0 hours
6: A 2022-11-01 07:00:00 FALSE 1 hours
7: A 2022-11-01 10:00:00 FALSE 4 hours
8: A 2022-11-01 12:00:00 FALSE 6 hours
9: B 2022-11-01 01:00:00 FALSE 5 hours
10: B 2022-11-01 02:00:00 FALSE 4 hours
11: B 2022-11-01 03:00:00 FALSE 3 hours
12: B 2022-11-01 06:00:00 TRUE 0 hours
13: B 2022-11-01 07:00:00 FALSE 1 hours
14: B 2022-11-01 08:00:00 FALSE 2 hours
15: B 2022-11-01 11:00:00 TRUE 0 hours
16: B 2022-11-01 13:00:00 FALSE 2 hours
- 使用
outer(由于使用效率apply不高)
setDT(df)[
,
diff_hours := apply(abs(outer(date, date[indicator], `-`)), 1, min) / 3600,
group
][]
你會看到
group date indicator diff_hours
1: A 2022-11-01 01:00:00 FALSE 3
2: A 2022-11-01 03:00:00 FALSE 1
3: A 2022-11-01 04:00:00 TRUE 0
4: A 2022-11-01 05:00:00 FALSE 1
5: A 2022-11-01 06:00:00 TRUE 0
6: A 2022-11-01 07:00:00 FALSE 1
7: A 2022-11-01 10:00:00 FALSE 4
8: A 2022-11-01 12:00:00 FALSE 6
9: B 2022-11-01 01:00:00 FALSE 5
10: B 2022-11-01 02:00:00 FALSE 4
11: B 2022-11-01 03:00:00 FALSE 3
12: B 2022-11-01 06:00:00 TRUE 0
13: B 2022-11-01 07:00:00 FALSE 1
14: B 2022-11-01 08:00:00 FALSE 2
15: B 2022-11-01 11:00:00 TRUE 0
16: B 2022-11-01 13:00:00 FALSE 2
uj5u.com熱心網友回復:
以下是 base R 中的一些先前方法:
#Ma?l answer in base R
by(df, df$group, \(d) transform(
d, diff_hours = sapply(d$date, \(x) min(abs(x - d$date[d[["indicator"]]])))
)) |>
do.call(what = rbind.data.frame)
#> group date indicator diff_hours
#> A.1 A 2022-10-31 20:00:00 FALSE 3
#> A.2 A 2022-10-31 22:00:00 FALSE 1
#> A.3 A 2022-10-31 23:00:00 TRUE 0
#> A.4 A 2022-11-01 00:00:00 FALSE 1
#> A.5 A 2022-11-01 01:00:00 TRUE 0
#> A.6 A 2022-11-01 02:00:00 FALSE 1
#> A.7 A 2022-11-01 05:00:00 FALSE 4
#> A.8 A 2022-11-01 07:00:00 FALSE 6
#> B.9 B 2022-10-31 20:00:00 FALSE 5
#> B.10 B 2022-10-31 21:00:00 FALSE 4
#> B.11 B 2022-10-31 22:00:00 FALSE 3
#> B.12 B 2022-11-01 01:00:00 TRUE 0
#> B.13 B 2022-11-01 02:00:00 FALSE 1
#> B.14 B 2022-11-01 03:00:00 FALSE 2
#> B.15 B 2022-11-01 06:00:00 TRUE 0
#> B.16 B 2022-11-01 08:00:00 FALSE 2
#ThomasIsCoding answer in base
transform(df, diff_hours = apply(abs(outer(df$date, df$date[df$indicator], `-`))/3600, 1, min))
#> group date indicator diff_hours
#> 1 A 2022-10-31 20:00:00 FALSE 3
#> 2 A 2022-10-31 22:00:00 FALSE 1
#> 3 A 2022-10-31 23:00:00 TRUE 0
#> 4 A 2022-11-01 00:00:00 FALSE 1
#> 5 A 2022-11-01 01:00:00 TRUE 0
#> 6 A 2022-11-01 02:00:00 FALSE 1
#> 7 A 2022-11-01 05:00:00 FALSE 1
#> 8 A 2022-11-01 07:00:00 FALSE 1
#> 9 B 2022-10-31 20:00:00 FALSE 3
#> 10 B 2022-10-31 21:00:00 FALSE 2
#> 11 B 2022-10-31 22:00:00 FALSE 1
#> 12 B 2022-11-01 01:00:00 TRUE 0
#> 13 B 2022-11-01 02:00:00 FALSE 1
#> 14 B 2022-11-01 03:00:00 FALSE 2
#> 15 B 2022-11-01 06:00:00 TRUE 0
#> 16 B 2022-11-01 08:00:00 FALSE 2
uj5u.com熱心網友回復:
解決方案基于tidyr::fill():
library(dplyr)
library(tidyr)
df %>%
arrange(group, date) %>%
mutate(
ind_prev = if_else(indicator, date, as.POSIXct(NA)),
ind_next = ind_prev
) %>%
group_by(group) %>%
fill(ind_prev, .direction = "down") %>%
fill(ind_next, .direction = "up") %>%
ungroup() %>%
mutate(
across(
ind_prev:ind_next,
~ abs(as.numeric(date - .x, unit = "hours"))
),
diff_hours = pmin(ind_prev, ind_next, na.rm = TRUE)
) %>%
select(!ind_prev:ind_next)
# A tibble: 16 × 4
group date indicator diff_hours
<chr> <dttm> <lgl> <dbl>
1 A 2022-11-01 00:00:00 FALSE 3
2 A 2022-11-01 02:00:00 FALSE 1
3 A 2022-11-01 03:00:00 TRUE 0
4 A 2022-11-01 04:00:00 FALSE 1
5 A 2022-11-01 05:00:00 TRUE 0
6 A 2022-11-01 06:00:00 FALSE 1
7 A 2022-11-01 09:00:00 FALSE 4
8 A 2022-11-01 11:00:00 FALSE 6
9 B 2022-11-01 00:00:00 FALSE 5
10 B 2022-11-01 01:00:00 FALSE 4
11 B 2022-11-01 02:00:00 FALSE 3
12 B 2022-11-01 05:00:00 TRUE 0
13 B 2022-11-01 06:00:00 FALSE 1
14 B 2022-11-01 07:00:00 FALSE 2
15 B 2022-11-01 10:00:00 TRUE 0
16 B 2022-11-01 12:00:00 FALSE 2
uj5u.com熱心網友回復:
使用函式族的基本 R解決方案。apply首先split按組設定,然后將所有日期與指標 == TRUE日期進行比較,然后選擇min.
cbind(df, diff_hours = unlist(lapply(split(df, df$group), function(grp)
apply(sapply(grp[grp$indicator == T, "date"], function(date_T)
abs(difftime(date_T, grp$date, u = "hour"))), 1, min))))
group date indicator diff_hours
A1 A 2022-11-01 01:00:00 FALSE 3
A2 A 2022-11-01 03:00:00 FALSE 1
A3 A 2022-11-01 04:00:00 TRUE 0
A4 A 2022-11-01 05:00:00 FALSE 1
A5 A 2022-11-01 06:00:00 TRUE 0
A6 A 2022-11-01 07:00:00 FALSE 1
A7 A 2022-11-01 10:00:00 FALSE 4
A8 A 2022-11-01 12:00:00 FALSE 6
B1 B 2022-11-01 01:00:00 FALSE 5
B2 B 2022-11-01 02:00:00 FALSE 4
B3 B 2022-11-01 03:00:00 FALSE 3
B4 B 2022-11-01 06:00:00 TRUE 0
B5 B 2022-11-01 07:00:00 FALSE 1
B6 B 2022-11-01 08:00:00 FALSE 2
B7 B 2022-11-01 11:00:00 TRUE 0
B8 B 2022-11-01 13:00:00 FALSE 2
轉載請註明出處,本文鏈接:https://www.uj5u.com/qukuanlian/534942.html
上一篇:如何在資料幀的子集上添加時間增量
