一次合并R数据框中的列值
有没有办法一次性将 R 数据框中的某一列与其他各列分别合并?
例如,
asd <- data.frame(a = c("A","B"), b = c("d","f"), c = c("x","y"))
asd
asd <- data.frame(a = c("A","B"), b = c("d","f"), c = c("x","y"))
asd
预期输出(将列 a 分别与列 b 和列 c 合并):
a b c
1 A Ad Ax
2 B Bf By
回答
您可以用 paste0 将第一列 asd[[1]] 与经 unlist 展平后的其余各列 unlist(asd[-1]) 拼接起来,再把结果赋值回 data.frame 中其余各列所在的位置 asd[-1]。
asd[-1] <- paste0(asd[[1]], unlist(asd[-1]))
# a b c
#1 A Ad Ax
#2 B Bf By
在 unlist 中禁用 recursive 和 use.names 可能会提高性能:
asd[-1] <- paste0(asd[[1]], unlist(asd[-1], FALSE, FALSE))
同样的做法,但按列名引用:
S <- c("b", "c")
asd[S] <- paste0(asd$a, unlist(asd[S]))
另一种方法是在 Map 中使用 paste0,并对 asd 取子集:一次用 [-1] 排除第一列,另一次用 [rep(1,2)] 将第一列取 2 次。
asd[-1] <- Map(paste0, asd[rep(1,2)], asd[-1])
同样的做法,但按列名引用:
S <- c("b", "c")
asd[S] <- Map(paste0, asd[rep("a", length(S))], asd[S])
另一种方法是使用 for 循环:
for(i in 2:3) {asd[[i]] <- paste0(asd[[1]], asd[[i]])}
for(i in c("b", "c")) {asd[[i]] <- paste0(asd$a, asd[[i]])}
方法比较:
# Build a random example data frame for the benchmarks.
#
# nr: number of rows.
# nc: number of additional lower-case columns.
#
# Returns a data frame whose first column `a` holds `nr` upper-case letters
# sampled with replacement, followed by `nc` columns named b1, b2, ... that
# each hold `nr` lower-case letters sampled with replacement.
getDf <- function(nr, nc) {
  # Draw the key column first so the random stream is consumed in the same
  # order as the columns appear in the result.
  key <- sample(LETTERS, nr, TRUE)
  idx <- seq_len(nc)
  extra <- lapply(idx, function(i) sample(letters, nr, TRUE))
  names(extra) <- paste0("b", idx)
  data.frame(a = key, extra)
}
library(dplyr)
library(stringr)
library(purrr)
# Benchmark candidates: one named entry per approach. alist() keeps each entry
# unevaluated, and every entry calls an anonymous function on `D`, so `D` must
# exist by the time the expressions are eventually evaluated.
# Each function returns the modified data frame.
M <- alist(
# Flatten all but the first column with unlist() and paste column 1 onto it.
unlist = (function(asd) {asd[,-1] <- paste0(asd[,1], unlist(asd[,-1], FALSE, FALSE)); asd})(D)
# Pair the first column, repeated ncol(asd)-1 times, with each other column.
, Map = (function(asd) {asd[-1] <- Map(paste0, asd[rep(1,ncol(asd)-1)], asd[-1]); asd})(D)
# Loop over column indices 2..ncol(asd), overwriting each column in turn.
, "for" = (function(asd) {for(i in 2:ncol(asd)) {asd[[i]] <- paste0(asd[,1], asd[,i])}; asd})(D)
# Same loop, with str_c in place of paste0.
, "for+str_c" = (function(asd) {for(i in 2:ncol(asd)) {asd[[i]] <- str_c(asd[,1], asd[,i])}; asd})(D)
# lapply over the remaining columns, prefixing each with column a.
, lapply = (function(asd) {asd[-1] <- lapply(asd[-1], function(x) paste0(asd$a, x)); asd})(D)
# mutate/across over every column except a.
, across = (function(asd) {asd <- asd %>% mutate(across(-a, ~str_c(a, .x))); asd})(D)
# pmap_dfr builds each output row separately; the result is converted back
# with as.data.frame().
, pmap = (function(asd) {asd <- asd %>%
pmap_dfr(~ c(list(...)[1], setNames(paste(..1, c(...)[-1], sep = ""), names(asd)[-1]))); as.data.frame(asd)})(D)
# Index column a by row(asd[-1]) so it lines up with as.matrix(asd[-1]).
, "row+matrix" = (function(asd) {asd[-1] <- paste0(asd$a[row(asd[-1])], as.matrix(asd[-1])); asd})(D)
# apply over the columns (margin 2) of everything but the first column.
, apply = (function(asd) {asd[-1] <- apply(asd[-1], 2, function(x) paste0(asd[[1]], x)); asd})(D)
)
D <- getDf(1e5,2) # 1e5 rows and 2 columns besides `a`
bench::mark(exprs = M)
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
# <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
#1 unlist 29.07ms 29.92ms 29.5 12.68MB 11.8 15 6 509ms
#2 Map 22.94ms 23.02ms 42.6 1.53MB 1.94 22 1 516.38ms
#3 for 22.84ms 22.96ms 42.8 1.53MB 1.94 22 1 514.15ms
#4 for+str_c 9.78ms 10ms 97.2 1.53MB 3.97 49 2 503.89ms
#5 lapply 22.89ms 23.01ms 42.7 1.53MB 1.94 22 1 514.82ms
#6 across 12.29ms 12.57ms 77.8 1.53MB 1.99 39 1 501.43ms
#7 pmap 2.95s 2.95s 0.339 9.54MB 6.45 1 19 2.95s
#8 row+matrix 30.64ms 32.65ms 19.8 14.97MB 6.09 13 4 656.35ms
#9 apply 32.93ms 34.12ms 27.7 19.55MB 5.94 14 3 504.85ms
#Warning message:
#Some expressions had a GC in every iteration; so filtering is disabled.
D <- getDf(1e2, 1e3)
bench::mark(exprs = M)
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
# <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
#1 unlist 21.4ms 21.7ms 45.2 18.08MB 9.68 14 3 310ms
#2 Map 28ms 28.1ms 35.3 12.53MB 4.41 16 2 453ms
#3 for 39.3ms 39.4ms 25.4 8.5MB 2.11 12 1 473ms
#4 for+str_c 34.1ms 34.3ms 29.1 8.5MB 4.48 13 2 447ms
#5 lapply 21.9ms 22.1ms 44.7 12.48MB 7.46 18 3 402ms
#6 across 80.3ms 80.9ms 12.3 5.98MB 4.93 5 2 406ms
#7 pmap 113.9ms 114ms 8.74 17.5MB 5.83 3 2 343ms
#8 row+matrix 24.5ms 24.6ms 40.2 19.31MB 10.7 15 4 373ms
#9 apply 32.3ms 32.5ms 30.5 21.72MB 11.1 11 4 360ms
就内存占用而言,推荐使用 across 和 for 循环。就速度而言,在 2 列的情况下 Map、for 和 lapply 较快,在 1000 列的情况下 unlist 和 lapply 较快,因此总体上推荐 lapply。此外,用 str_c 代替 paste 也可以提高性能。
如果所有列的类型相同,可以考虑将数据存储在 matrix 中,这在列数很多的情况下会显示出优势。
M <- as.matrix(asd)
M[,-1] <- paste0(M[,1], M[,-1])
M
# a b c
#[1,] "A" "Ad" "Ax"
#[2,] "B" "Bf" "By"
D <- getDf(1e5,2)
M <- as.matrix(D)
# Compare the data-frame variants (lapply) with the matrix variants, each once
# with paste0 and once with stringr::str_c.
bench::mark(check = FALSE # One gives a data frame, the other a matrix
, lapply = (function(asd) {asd[-1] <- lapply(asd[-1], function(x) paste0(asd$a, x))})(D)
, lapplyStr_C = (function(asd) {asd[-1] <- lapply(asd[-1], function(x) stringr::str_c(asd$a, x))})(D)
, matrix = (function(M) {M[,-1] <- paste0(M[,1], M[,-1])})(M)
, matrixStr_C = (function(M) {M[,-1] <- stringr::str_c(M[,1], M[,-1])})(M)
)
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
# <bch:expr> <bch:t> <bch:> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
#1 lapply 28.3ms 28.8ms 34.7 1.53MB 0 18 0 519ms
#2 lapplyStr_C 13.6ms 13.9ms 71.6 1.53MB 2.05 35 1 489ms
#3 matrix 34.1ms 34.4ms 28.9 7.25MB 7.24 12 3 415ms
#4 matrixStr_C 17.8ms 18.2ms 53.9 7.25MB 7.35 22 3 408ms
D <- getDf(1e2, 1e3)
M <- as.matrix(D)
# Same comparison of data-frame (lapply) and matrix variants, run on the
# current `D` and `M`.
bench::mark(check = FALSE # One gives a data frame, the other a matrix
, lapply = (function(asd) {asd[-1] <- lapply(asd[-1], function(x) paste0(asd$a, x))})(D)
, lapplyStr_C = (function(asd) {asd[-1] <- lapply(asd[-1], function(x) stringr::str_c(asd$a, x))})(D)
, matrix = (function(M) {M[,-1] <- paste0(M[,1], M[,-1])})(M)
, matrixStr_C = (function(M) {M[,-1] <- stringr::str_c(M[,1], M[,-1])})(M)
)
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
#1 lapply 32.41ms 32.66ms 30.5 12.48MB 15.2 10 5
#2 lapplyStr_C 26.85ms 27.11ms 36.9 12.48MB 18.4 12 6
#3 matrix 16.28ms 16.94ms 59.4 2.32MB 2.05 29 1
#4 matrixStr_C 7.51ms 7.77ms 127. 2.32MB 6.90 55 3
回答
您可以在基础 R 中使用 lapply:
asd[-1] <- lapply(asd[-1], function(x) paste0(asd$a, x))
或者使用 dplyr 中的 across:
library(dplyr)
library(stringr)
asd %>% mutate(across(-a, ~str_c(a, .x)))
# a b c
#1 A Ad Ax
#2 B Bf By
回答
我们还可以使用 purrr 包中的 pmap 函数:
library(purrr)
asd %>%
pmap_dfr(~ c(list(...)[1], setNames(paste(..1, c(...)[-1], sep = ""), names(asd)[-1])))
# A tibble: 2 x 3
a b c
<chr> <chr> <chr>
1 A Ad Ax
2 B Bf By