將匹配元素作為串列回傳的快速方法-有解無憂

我有兩個向量：

set.seed(1)

a <- sample(1:100,200, replace=T)

b <- sample(1:100,40, replace=F)

我想找到該a匹配中元素的位置b：

sapply(b, function(x) which(a %in% x))

這可以完成作業，但需要很長時間

有沒有辦法將結果存盤在一個實際上很快的串列中？

所需的輸出如下所示：

sapply(b, function(x) which(a %in% x))
[[1]]
integer(0)

[[2]]
integer(0)

[[3]]
[1] 107 142 199

[[4]]
[1] 109 126

[[5]]
[1] 136 167

[[6]]
integer(0)

[[7]]
integer(0)

[[8]]
[1]  73  91 176

[[9]]
[1]  51 146 181

uj5u.com熱心網友回復：

這是解決您的問題的另一種方法。我還添加了@GKi、@ThomasIsCoding 和我提出的解決方案的基準。

res  = rep_len(list(integer(0)), length(b))
comm = intersect(b, a)                            # do not flip the order of elements
res[match(comm, b)] = lapply(comm, function(x) which(a==x))

[[1]]
integer(0)

[[2]]
integer(0)

[[3]]
[1] 107 142 199

[[4]]
[1] 109 126

[[5]]
[1] 136 167

[[6]]
integer(0)

[[7]]
integer(0)

[[8]]
[1]  73  91 176

[[9]]
[1]  51 146 181

...

基準

f1_kamgang = function() {
  res  = rep_len(list(integer(0)), length(b))
  comm = intersect(b, a)                               # do not flip the order of elements
  res[match(comm, b)] = lapply(comm, function(x) which(a==x))
  res
}

f2_Thomas = function() {
  unname(by(seq_along(a), a, list)[as.character(b)])
}

f3_GKi = function() {
  sapply(b, function(x) which(a == x))     
}

f4_original = function() {
  sapply(b, function(x) which(a %in% x))     
}



microbenchmark::microbenchmark(
  f1_kamgang(),
  f2_Thomas(),
  f3_GKi(),
  f4_original(),
  times=5L
)

Unit: microseconds
          expr    min     lq    mean median     uq    max neval cld
  f1_kamgang()  191.2  194.8  209.70  214.3  216.5  231.7     5 a  
   f2_Thomas() 2105.4 2130.0 2240.76 2267.9 2308.8 2391.7     5   c
      f3_GKi()  223.5  231.0  242.72  242.6  253.9  262.6     5 a  
 f4_original()  365.5  366.3  393.10  376.9  399.6  457.2     5  b

uj5u.com熱心網友回復：

我們可以玩一個技巧并通過（但是，我想說這個選項很有趣但不快by）來索引串列，例如，as.character(b)

> unname(by(seq_along(a), a, list)[as.character(b)])
[[1]]
NULL

[[2]]
NULL

[[3]]
[1] 107 142 199

[[4]]
[1] 109 126

[[5]]
[1] 136 167

[[6]]
NULL

[[7]]
NULL

[[8]]
[1]  73  91 176

[[9]]
[1]  51 146 181

[[10]]
[1] 191

[[11]]
[1]  55 118

[[12]]
[1] 192

[[13]]
[1]  40  64 110 165

[[14]]
[1]  20  22 122 175

[[15]]
[1] 137 189

[[16]]
[1] 134

[[17]]
[1] 128

[[18]]
[1]  17  81 184

[[19]]
NULL

[[20]]
[1] 188 194

[[21]]
[1]  98 180

[[22]]
[1]  62 145

[[23]]
[1] 33

[[24]]
NULL

[[25]]
[1] 47

[[26]]
NULL

[[27]]
[1]  29 114 159

[[28]]
[1]  18  26 171

[[29]]
[1]  28  69 186 200

[[30]]
[1] 42

[[31]]
[1]  79 158 190

[[32]]
[1]  5 38 58 82

[[33]]
[1]  35  74 121

[[34]]
[1] 150

[[35]]
[1]  34  36 139

[[36]]
[1]  70 100 117 195

[[37]]
NULL

[[38]]
[1]  32  46 102

[[39]]
[1]  89 133

[[40]]
[1] 127 129 160

uj5u.com熱心網友回復：

您可以使用from%==%的更快版本（快 3 倍）。另一個可能派上用場的折疊功能是. 也比：whichcollapsewhichvlapplysapply

library(collapse)
lapply(b, function(x) a %==% x)
lapply(b, function(x) whichv(a, x))

identical(sapply(b, function(x) a %==% x),
          sapply(b, function(x) which(a %in% x)))
#[1] TRUE

基準

bench::mark(
  by = unname(by(seq_along(a), a, list)[as.character(b)]),
  original = sapply(b, function(x) which(a %in% x)),
  "==" = sapply(b, function(x) which(a == x)),
  collapse_s = sapply(b, function(x) collapse::whichv(a, x)),
  collapse_l = lapply(b, function(x) collapse::whichv(a, x)),
  "%==%" = lapply(b, function(x) a %==% x),
  time_unit = "ms",
  check = FALSE
)

# expression    min median `itr/sec` mem_a…1 gc/se…2 n_itr  n_gc total…3 result
# by         1.56   1.74        545.  27.1KB    15.6   244     7    448. <NULL>
# original   0.189  0.221      3770. 140.9KB    11.4  1659     5    440. <NULL>
# ==         0.103  0.118      7657.  74.8KB    13.8  3319     6    433. <NULL>
# collapse_s 0.0838 0.0917     9884.  39.8KB    15.8  4389     7    444. <NULL>
# collapse_l 0.0722 0.0794    11379.  38.9KB    13.5  4217     5    371. <NULL>
# %==%       0.0591 0.0648    13852.  38.8KB    15.5  6239     7    450. <NULL>

uj5u.com熱心網友回復：

這是一種有效的矢量化方法：

f <- function(x, y) {
  m <- y[match(x, y)]
  idx <- !is.na(m)
  unname(split(which(idx), factor(match(m[idx], b), levels = seq.int(length(y)))))
}

f(a, b)

[[1]]
integer(0)

[[2]]
integer(0)

[[3]]
[1] 107 142 199

[[4]]
[1] 109 126

[[5]]
[1] 136 167

[[6]]
integer(0)

[[7]]
integer(0)

[[8]]
[1]  73  91 176

[[9]]
[1]  51 146 181

[[10]]
[1] 191

...

uj5u.com熱心網友回復：

您可以使用==而不是%in%.

lapply(b, function(x) which(a == x))

另一種方法是C 使用rcpp.

Rcpp::cppFunction("
Rcpp::List getIdx(const Rcpp::IntegerVector& a,
                  const Rcpp::IntegerVector& b) {
  std::map<int, std::vector<int> > m;
  for(auto const& i : b) m[i].clear();
  for(int i=0; i<=a.size();   i) {
    auto j = m.find(a[i]);
    if(j != m.end()) j->second.push_back(i 1);
  }
  std::vector< std::vector<int> > res;
  for(auto const& i : b) res.push_back(m[i]);
  return wrap( res );
}")
getIdx(a, b)

基準

set.seed(1)
a <- sample(1:100,200, replace=T)
b <- sample(1:100,40, replace=F)

bench::mark(
         origninal = sapply(b, function(x) which(a %in% x)),
         "==" = lapply(b, function(x) which(a == x)),
         rcpp = getIdx(a, b)
       )
#  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_…1
#  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl> <bch:tm>
#1 origninal  111.19μs 124.11μs     7090.   140.9KB     27.6  2824    11  398.3ms
#2 ==          66.77μs  72.78μs    12478.   73.83KB     26.9  5560    12  445.6ms
#3 rcpp         7.05μs   8.66μs   111343.    2.85KB     11.1  9999     1   89.8ms

rcpp 版本比原始版本快約 15 倍。

uj5u.com熱心網友回復：

如果您對長度為零的元素感到滿意，NA則可以將向量強制為 list 和replace.

replace(as.list(a), !a %in% b, NA)

轉載請註明出處，本文鏈接：https://www.uj5u.com/yidong/516182.html

標籤：r表现向量

上一篇：根據單獨表中規定的順序獲取SQL中任意字串的范圍

下一篇：使用cmake時避免被傳遞依賴阻塞