一次合并R数据框中的列值

有没有办法一次性将 R 数据框中的某一列与其他各列的值组合在一起?

例如,

asd <- data.frame(a = c("A","B"), b = c("d","f"), c = c("x","y"))
asd
asd <- data.frame(a = c("A","B"), b = c("d","f"), c = c("x","y"))
asd

预期输出(将列“a”分别与列 b 和 c 组合):

  a  b  c
1 A Ad Ax
2 B Bf By

回答

您可以使用 paste0 将第一列 asd[[1]] 与经 unlist 展开的其他列 unlist(asd[-1]) 拼接,并将结果赋值回 data.frame 中其他列所在的位置 asd[-1]:

asd[-1] <- paste0(asd[[1]], unlist(asd[-1]))
#  a  b  c
#1 A Ad Ax
#2 B Bf By

在 unlist 中禁用 recursive 和 use.names 可能会提高性能:

asd[-1] <- paste0(asd[[1]], unlist(asd[-1], FALSE, FALSE))

相同但使用名称:

S <- c("b", "c")
asd[S] <- paste0(asd$a, unlist(asd[S]))

另一种方法是在 Map 中使用 paste0,并对 asd 取两次子集:一次用 [-1] 排除第一列,另一次用 [rep(1,2)] 将第一列取 2 次。

asd[-1] <- Map(paste0, asd[rep(1,2)], asd[-1])

相同但使用名称:

S <- c("b", "c")
asd[S] <- Map(paste0, asd[rep("a", length(S))], asd[S])

另一种方法是使用for循环;

for(i in 2:3) {asd[[i]] <- paste0(asd[[1]], asd[[i]])}

for(i in c("b", "c")) {asd[[i]] <- paste0(asd$a, asd[[i]])}

方法比较:

# Build a random example data frame for the benchmarks.
#   nr: number of rows.
#   nc: number of extra columns, named b1, b2, ..., b<nc>.
# Column `a` holds uppercase letters sampled with replacement; each b-column
# holds lowercase letters sampled with replacement. `a` is drawn first, then
# the b-columns in order.
getDf <- function(nr, nc) {
  key_col <- sample(LETTERS, nr, TRUE)
  other_cols <- lapply(seq_len(nc), function(i) sample(letters, nr, TRUE))
  names(other_cols) <- paste0("b", seq_len(nc))
  data.frame(a = key_col, other_cols)
}

library(dplyr)
library(stringr)
library(purrr)
# Benchmark candidates. alist() keeps each argument unevaluated, so the
# expressions can refer to `D` even though `D` is only assigned further below.
# Every candidate wraps its work in an anonymous function applied to `D`
# and returns the modified object.
M <- alist(
    # Paste column 1 onto the unlisted remaining columns
    # (recursive = FALSE, use.names = FALSE passed positionally).
    unlist = (function(asd) {asd[,-1] <- paste0(asd[,1], unlist(asd[,-1], FALSE, FALSE)); asd})(D)
    # Map paste0 over the first column (repeated ncol - 1 times) and the other columns.
  , Map = (function(asd) {asd[-1] <- Map(paste0, asd[rep(1,ncol(asd)-1)], asd[-1]); asd})(D)
    # Explicit loop over column indices 2..ncol using paste0.
  , "for" = (function(asd) {for(i in 2:ncol(asd)) {asd[[i]] <- paste0(asd[,1], asd[,i])}; asd})(D)
    # Same loop, but using str_c instead of paste0.
  , "for+str_c" = (function(asd) {for(i in 2:ncol(asd)) {asd[[i]] <- str_c(asd[,1], asd[,i])}; asd})(D)
    # lapply over all columns except the first, prefixing each with asd$a.
  , lapply = (function(asd) {asd[-1] <- lapply(asd[-1], function(x) paste0(asd$a, x)); asd})(D)
    # mutate(across(...)) over every column except `a`.
  , across = (function(asd) {asd <- asd %>% mutate(across(-a, ~str_c(a, .x))); asd})(D)
    # Row-wise pmap_dfr; the result is converted back with as.data.frame().
  , pmap = (function(asd) {asd <- asd %>%
  pmap_dfr(~ c(list(...)[1], setNames(paste(..1, c(...)[-1], sep = ""), names(asd)[-1]))); as.data.frame(asd)})(D)
    # Index asd$a by the row-index matrix of the other columns, then paste with them as a matrix.
  , "row+matrix" = (function(asd) {asd[-1] <- paste0(asd$a[row(asd[-1])], as.matrix(asd[-1])); asd})(D)
    # apply() over columns (MARGIN = 2) of everything except the first column.
  , apply = (function(asd) {asd[-1] <- apply(asd[-1], 2, function(x) paste0(asd[[1]], x)); asd})(D)
)
D <- getDf(1e5,2) # 1e5 rows and 2 columns
# Run all candidate expressions stored in M against the current D.
bench::mark(exprs = M)
#  expression     min  median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
#  <bch:expr> <bch:t> <bch:t>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm>
#1 unlist     29.07ms 29.92ms    29.5     12.68MB    11.8     15     6      509ms
#2 Map        22.94ms 23.02ms    42.6      1.53MB     1.94    22     1   516.38ms
#3 for        22.84ms 22.96ms    42.8      1.53MB     1.94    22     1   514.15ms
#4 for+str_c   9.78ms    10ms    97.2      1.53MB     3.97    49     2   503.89ms
#5 lapply     22.89ms 23.01ms    42.7      1.53MB     1.94    22     1   514.82ms
#6 across     12.29ms 12.57ms    77.8      1.53MB     1.99    39     1   501.43ms
#7 pmap         2.95s   2.95s     0.339    9.54MB     6.45     1    19      2.95s
#8 row+matrix 30.64ms 32.65ms    19.8     14.97MB     6.09    13     4   656.35ms
#9 apply      32.93ms 34.12ms    27.7     19.55MB     5.94    14     3   504.85ms
#Warning message:
#Some expressions had a GC in every iteration; so filtering is disabled. 
D <- getDf(1e2, 1e3)
bench::mark(exprs = M)
#  expression     min  median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
#  <bch:expr> <bch:t> <bch:t>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm>
#1 unlist      21.4ms  21.7ms     45.2    18.08MB     9.68    14     3      310ms
#2 Map           28ms  28.1ms     35.3    12.53MB     4.41    16     2      453ms
#3 for         39.3ms  39.4ms     25.4      8.5MB     2.11    12     1      473ms
#4 for+str_c   34.1ms  34.3ms     29.1      8.5MB     4.48    13     2      447ms
#5 lapply      21.9ms  22.1ms     44.7    12.48MB     7.46    18     3      402ms
#6 across      80.3ms  80.9ms     12.3     5.98MB     4.93     5     2      406ms
#7 pmap       113.9ms   114ms      8.74    17.5MB     5.83     3     2      343ms
#8 row+matrix  24.5ms  24.6ms     40.2    19.31MB    10.7     15     4      373ms
#9 apply       32.3ms  32.5ms     30.5    21.72MB    11.1     11     4      360ms

就内存使用而言,推荐 across 和 for 循环。就速度而言,在 2 列的情况下 Map、for 和 lapply 较快,在 1000 列的情况下 unlist 和 lapply 较快,因此总体上可以推荐 lapply。此外,使用 str_c 代替 paste0 也可以提高性能。


如果所有列都具有相同的类型,可以考虑将数据存储在 matrix 中;在列数很多的情况下,它会显示出优势。

M <- as.matrix(asd)

M[,-1] <- paste0(M[,1], M[,-1])

M
#     a   b    c   
#[1,] "A" "Ad" "Ax"
#[2,] "B" "Bf" "By"
# Compare the data-frame approach with the matrix approach on 1e5 rows and 2 columns.
D <- getDf(1e5,2)
# Note: this reuses the name M for the matrix version of D.
M <- as.matrix(D)
# check = FALSE turns off result comparison, since the data-frame and matrix
# variants do not produce identical objects. Each anonymous function ends with
# an assignment, so its value is the assigned right-hand side.
bench::mark(check = FALSE # One gives a data frame, the other a matrix
 , lapply = (function(asd) {asd[-1] <- lapply(asd[-1], function(x) paste0(asd$a, x))})(D)
 , lapplyStr_C = (function(asd) {asd[-1] <- lapply(asd[-1], function(x) stringr::str_c(asd$a, x))})(D)
 , matrix = (function(M) {M[,-1] <- paste0(M[,1], M[,-1])})(M)
 , matrixStr_C = (function(M) {M[,-1] <- stringr::str_c(M[,1], M[,-1])})(M)
)
#  expression      min median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
#  <bch:expr>  <bch:t> <bch:>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm>
#1 lapply       28.3ms 28.8ms      34.7    1.53MB     0       18     0      519ms
#2 lapplyStr_C  13.6ms 13.9ms      71.6    1.53MB     2.05    35     1      489ms
#3 matrix       34.1ms 34.4ms      28.9    7.25MB     7.24    12     3      415ms
#4 matrixStr_C  17.8ms 18.2ms      53.9    7.25MB     7.35    22     3      408ms
# Same comparison on 1e2 rows and 1e3 columns.
D <- getDf(1e2, 1e3)
M <- as.matrix(D)
# check = FALSE turns off result comparison, since the data-frame and matrix
# variants do not produce identical objects.
bench::mark(check = FALSE # One gives a data frame, the other a matrix
 , lapply = (function(asd) {asd[-1] <- lapply(asd[-1], function(x) paste0(asd$a, x))})(D)
 , lapplyStr_C = (function(asd) {asd[-1] <- lapply(asd[-1], function(x) stringr::str_c(asd$a, x))})(D)
 , matrix = (function(M) {M[,-1] <- paste0(M[,1], M[,-1])})(M)
 , matrixStr_C = (function(M) {M[,-1] <- stringr::str_c(M[,1], M[,-1])})(M)
)
#  expression       min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
#  <bch:expr>  <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>
#1 lapply       32.41ms  32.66ms      30.5   12.48MB    15.2     10     5
#2 lapplyStr_C  26.85ms  27.11ms      36.9   12.48MB    18.4     12     6
#3 matrix       16.28ms  16.94ms      59.4    2.32MB     2.05    29     1
#4 matrixStr_C   7.51ms   7.77ms     127.     2.32MB     6.90    55     3


回答

您可以在基础 R 中使用 lapply:

asd[-1] <- lapply(asd[-1], function(x) paste0(asd$a, x))

或者使用 dplyr 中的 across:

library(dplyr)
library(stringr)

asd %>% mutate(across(-a, ~str_c(a, .x)))

#  a  b  c
#1 A Ad Ax
#2 B Bf By


回答

我们还可以使用 purrr 包中的 pmap 函数:

library(purrr)

asd %>%
  pmap_dfr(~ c(list(...)[1], setNames(paste(..1, c(...)[-1], sep = ""), names(asd)[-1])))

# A tibble: 2 x 3
  a     b     c
  <chr> <chr> <chr>
1 A     Ad    Ax
2 B     Bf    By


以上是一次合并R数据框中的列值的全部内容。
THE END
分享
二维码
< <上一篇
下一篇>>