如何重写循环以在R中运行得更快?
给定 > 900,000 行的数据集,其中length(duplicates) = >300,000,以下循环在 R 中运行需要大约 4 小时,这是不可接受的。
for(i in duplicates) {
couple_table <- filter(data, pnr == i) # filter patients
min_date <- min(couple_table$date) # determine date of first operation
max_date <- max(couple_table$date) # determine date of second operation
data$first[data$pnr == i & data$date == min_date] <- 1 # assign 1 to column first
data$second[data$pnr == i & data$date == max_date] <- 1 # assign 1 to column second
}
如何调整此代码以在 R 中运行得更快?我看过*apply但我一点也不熟悉,有什么想法吗?
虚拟数据:
data <- data.frame(pnr = c('a43','a4945', 'a43', 'a231', 'a231', 'a6901'),
date = c(as.Date('2011-12-19'), as.Date('2012-09-11'), as.Date('2013-10-01'),
as.Date('2012-05-09'), as.Date('2009-09-10'), as.Date('2015-06-12')))
duplicates <- as.character(data$pnr[duplicated(data$pnr)])
回答
分组操作会更快
library(dplyr)
data %>%
group_by(pnr) %>%
mutate(Min = if(n() > 1) NA^(date != min(date)) else NA,
Max = if(n() > 1) NA^(date != max(date)) else NA) %>%
ungroup
-输出
# A tibble: 6 x 4
# pnr date Min Max
# <chr> <date> <dbl> <dbl>
#1 a43 2011-12-19 1 NA
#2 a4945 2012-09-11 NA NA
#3 a43 2013-10-01 NA 1
#4 a231 2012-05-09 NA 1
#5 a231 2009-09-10 1 NA
#6 a6901 2015-06-12 NA NA
类似的逻辑data.table是
library(data.table)
setDT(data)[, c('Min', 'Max') := .(if(.N > 1)
NA^(date != min(date)) else NA, if(.N> 1)
NA^(date != max(date)) else NA), .(pnr)]
或者可以collapse用于更快的执行
library(collapse)
data %>%
ftransform(n = fNobs(date, pnr, TRA = 'replace_fill')) %>%
ftransform(Min = NA^(fmin(date, pnr, TRA = "replace_fill") != date | n == 1),
Max = NA^(fmax(date, pnr, TRA = "replace_fill") != date | n == 1), n = NULL )
# pnr date Min Max
#1 a43 2011-12-19 1 NA
#2 a4945 2012-09-11 NA NA
#3 a43 2013-10-01 NA 1
#4 a231 2012-05-09 NA 1
#5 a231 2009-09-10 1 NA
#6 a6901 2015-06-12 NA NA
或base R与duplicated
i1 <- with(data, duplicated(pnr)|duplicated(pnr, fromLast = TRUE))
data$Min <- with(data, i1 & date == ave(date, pnr, FUN = min))
data$Max <- with(data, i1 & date == ave(date, pnr, FUN = max))