我有兩個不完整的資料框:缺少列或 NA 值。“by”是合并索引,df_a 比 df_b 具有“優先級”。
# Example inputs: two incomplete data frames that share the key column `by`.
# df_a has an NA in column `b` and no column `c`.
df_a <- data.frame(
  by = c(1, 2, 3),
  a = c(1, 2, 3),
  b = c(1, NA, 3)
)
# df_b has an NA in column `a` and no column `b`.
df_b <- data.frame(
  by = c(2, 3, 4),
  a = c(7, NA, 4),
  c = c(2, 3, 4)
)
想要的結果:
by a b c
1 1 1 1 NA
2 2 2 NA 2
3 3 3 3 3
4 4 4 NA 4
有沒有人知道如何使用 R base 有效地做到這一點?提前致謝!
uj5u.com熱心網友回復:
這不是最優雅的,但您可以創建一個函式,應用您的規則來合并值,如果它們出現在兩個資料框中。
# collect every column name that occurs in either data frame, leaving out
# the join key "by"
cols <- setdiff(union(names(df_a), names(df_b)), "by")
# full outer join on "by": rows found in only one data frame are kept
df_merge <- merge(x = df_a, y = df_b, by = "by", all = TRUE)
# Return the merged values for one base column name of a merged data frame.
# If the data frame holds both a '<col_base>.x' and a '<col_base>.y' column,
# the '.x' values are returned with their NAs filled in from '.y'.
# Otherwise the column called '<col_base>' is returned as it is.
col_val <- function(col_base, df) {
  x_name <- paste0(col_base, ".x")
  y_name <- paste0(col_base, ".y")
  # no suffixed pair: the column was not duplicated, nothing to combine
  if (!all(c(x_name, y_name) %in% names(df))) {
    return(df[[col_base]])
  }
  merged <- df[[x_name]]
  gaps <- is.na(merged)
  merged[gaps] <- df[[y_name]][gaps]
  merged
}
# apply this function to every column; lapply() keeps each merged column's own
# type, whereas sapply() would simplify the result to a matrix and coerce all
# columns to one common type (e.g. numbers to character)
cbind(
  df_merge["by"],
  data.frame(
    lapply(setNames(cols, cols), col_val, df = df_merge),
    check.names = FALSE
  )
)
這將給出以下結果。
by a b c
1 1 1 1 NA
2 2 2 NA 2
3 3 3 3 3
4 4 4 NA 4
我知道您指定要使用 base R,不過 rqdatatable 套件的 natural_join() 函式仍然值得一提。
# natural_join() is provided by the rqdatatable package (not base R)
library(rqdatatable)
# full join of the two data frames on the key column "by"
natural_join(
  df_a,
  df_b,
  by = "by",
  jointype = "FULL"
)
這正是您想要的。
by a b c
1 1 1 1 NA
2 2 2 NA 2
3 3 3 3 3
4 4 4 NA 4
uj5u.com熱心網友回復:
#' Merge two incomplete data frames, with `df_a` taking priority
#'
#' Builds one row per distinct key found in either input and one column per
#' distinct column name. For a column present in both inputs, the value from
#' `df_a` is used unless it is `NA`; in that case the value from `df_b` fills
#' the gap.
#'
#' Rows are aligned by key value with `match()`, not by row position, so the
#' two inputs may list their keys in any order.
#'
#' @param df_a Data frame whose non-`NA` values win.
#' @param df_b Data frame used to fill the gaps left by `df_a`.
#' @param by Name of the key column; it must exist in both data frames. Keys
#'   are expected to be unique within each data frame (for a repeated key only
#'   its first row is used).
#' @return A data frame whose rows follow the first appearance of each key in
#'   `c(df_a[[by]], df_b[[by]])` and whose columns are the names of `df_a`
#'   followed by the names that only `df_b` has.
get_complete_df <- function(df_a, df_b, by = "by") {
  stopifnot(by %in% names(df_a), by %in% names(df_b))

  all_names <- union(names(df_a), names(df_b))
  all_by <- unique(c(df_a[[by]], df_b[[by]]))

  # Values of `col` from `df`, reordered to line up with `all_by`
  # (NA where the key is absent); NULL when `df` has no such column.
  aligned <- function(df, col) {
    if (!(col %in% names(df))) {
      return(NULL)
    }
    df[[col]][match(all_by, df[[by]])]
  }

  merged <- lapply(all_names, function(col) {
    if (col == by) {
      return(all_by)
    }
    from_a <- aligned(df_a, col)
    from_b <- aligned(df_b, col)
    if (is.null(from_a)) {
      return(from_b)
    }
    if (is.null(from_b)) {
      return(from_a)
    }
    # df_a has priority: only its NA entries are taken from df_b
    gaps <- is.na(from_a)
    from_a[gaps] <- from_b[gaps]
    from_a
  })
  names(merged) <- all_names

  data.frame(merged, check.names = FALSE, stringsAsFactors = FALSE)
}

# Example usage with df_a and df_b from the question (df_a has priority):
# get_complete_df(df_a, df_b)
#>   by a  b  c
#> 1  1 1  1 NA
#> 2  2 2 NA  2
#> 3  3 3  3  3
#> 4  4 4 NA  4
# Swapping the arguments gives df_b priority (a = 7 for by = 2):
# get_complete_df(df_a = df_b, df_b = df_a)
#>   by a  c  b
#> 1  2 7  2 NA
#> 2  3 3  3  3
#> 3  4 4  4 NA
#> 4  1 1 NA  1
uj5u.com熱心網友回復:
這不是 base R 的答案,而是使用 data.table 套件的一種可能解法:
library(data.table)
# convert both data frames to data.tables by reference
setDT(df_a)
setDT(df_b)
# pad each table with the keys it is missing (all other columns NA), derived
# from the data instead of hard-coding the literal rows for keys 4 and 1
df_a <- rbind(df_a, data.table(by = setdiff(df_b$by, df_a$by)), fill = TRUE)
df_b <- rbind(data.table(by = setdiff(df_a$by, df_b$by)), df_b, fill = TRUE)
# update join on `by`: keep df_a's `a` unless it is NA, then take df_b's `a`
# (i.a); `c` is taken from df_b explicitly via the i. prefix
df_a[df_b, `:=`(a = fifelse(is.na(a), i.a, a), c = i.c), on = .(by)][]
#> by a b c
#> 1: 1 1 1 NA
#> 2: 2 2 NA 2
#> 3: 3 3 3 3
#> 4: 4 4 NA 4
在@r2evans 的幫助下進行編輯,一個更加優雅和高效的解決方案:
# same update join on `by`, with fcoalesce() picking the first non-NA of
# df_a's `a` and df_b's `a` (i.a)
df_a[
  df_b,
  `:=`(
    a = fcoalesce(a, i.a),
    c = c
  ),
  on = .(by)
][]
#> by a b c
#> 1: 1 1 1 NA
#> 2: 2 2 NA 2
#> 3: 3 3 3 3
#> 4: 4 4 NA 4
由reprex 包(v2.0.1)于 2021 年 10 月 19 日創建
轉載請註明出處,本文鏈接:https://www.uj5u.com/houduan/325697.html
