使用dplyr在数据帧的每一列(对应的单词)中查找最大数
我有一个 9 列、20 行的数据框,如下图所示
(数据在本题末尾以 dput 函数的输出形式提供):
以第一个单词 (fireplac_balconi) 为例,它对应的 8 个数字分别显示在第一列到第八列中。我想要做的是在每一列中找到最大的 5 个数字,并在一个单独的数据框中显示它们在 word 列中对应的单词。
例如,在第 1 列 (t_1) 中,最大的数字是 6.61e-4,这个数字对应的单词是 pool_exercis。
我可以通过以下代码为每一列分别执行此操作:
# Keep only the first score column and the word column (positions 1 and 9),
# then sort the rows so that the largest t_1 value comes first.
data2 <- data[, c(1, 9)]
data3 <- data2 %>% arrange(desc(t_1))
然后选择前 5 个单词,但我想在一个步骤中对所有列执行此操作,并将输出作为数据帧。我怎样才能做到这一点?
数据(以下为 dput(data) 的输出;将其赋值给 data 即可重现示例):
structure(list(t_1 = c(0.000130787327029859, 2.04812368950562e-06,
0.000279032470270266, 9.75296995002676e-08, 8.78742592497411e-05,
9.75296995002676e-08, 0.000661348892311314, 8.59236652597357e-05,
0.000189305146730019, 9.75296995002676e-08, 6.44671313696769e-05,
9.75296995002676e-08, 9.75296995002676e-08, 4.3010597479618e-05,
7.81212892997143e-05, 0.000353155041890469, 4.69117854596287e-05,
9.75296995002676e-08, 9.75296995002676e-08, 9.75296995002676e-08
), t_2 = c(0.000237523589560676, 7.19550407636099e-08, 7.19550407636099e-08,
7.19550407636099e-08, 0.000120956423523628, 0.000382872771903168,
7.19550407636099e-08, 0.000413093889023885, 9.36135080334565e-05,
7.19550407636099e-08, 1.15847615629412e-05, 4.18058786836574e-05,
7.05878949891013e-05, 3.02930721614798e-05, 2.95015667130801e-06,
7.19550407636099e-08, 0.000182837758580333, 2.88539713462076e-05,
0.000502318139570761, 0.000428923997991879), t_3 = c(5.33688163157398e-05,
7.99582621669981e-05, 1.89924613223273e-07, 1.89924613223273e-07,
1.89924613223273e-07, 1.89924613223273e-07, 4.57718317868089e-05,
3.0577862728947e-05, 0.00029647232124153, 1.89924613223273e-07,
0.000148331122927376, 1.53838936710851e-05, 3.81748472578779e-05,
1.89924613223273e-07, 1.89924613223273e-07, 0.000114144692547187,
1.89924613223273e-07, 1.89924613223273e-07, 1.89924613223273e-07,
1.89924613223273e-07), t_4 = c(8.54762655821952e-05, 6.10893139182308e-05,
1.21934758319822e-07, 2.56062992471626e-06, 0.000387874466215354,
1.21934758319822e-07, 1.21934758319822e-07, 2.45088864222842e-05,
6.35280090846272e-05, 1.21934758319822e-07, 0.000197656243236431,
0.000775626997672387, 1.21934758319822e-07, 0.000168391901239674,
1.21934758319822e-07, 1.21934758319822e-07, 0.000382997075882561,
0.000634182678021394, 1.2315410590302e-05, 9.27923510813845e-05
), t_5 = c(2.86680234286555e-07, 0.000298434123892303, 0.000338569356692421,
2.86680234286555e-07, 4.04219130344042e-05, 2.86680234286555e-07,
2.86680234286555e-07, 2.86680234286555e-07, 2.86680234286555e-07,
1.17538896057487e-05, 2.86680234286555e-07, 2.86680234286555e-07,
2.86680234286555e-07, 0.00017802842549195, 0.00145088866572425,
2.86680234286555e-07, 2.86680234286555e-07, 2.86680234286555e-07,
2.86680234286555e-07, 2.86680234286555e-07), t_6 = c(5.63943589850594e-07,
5.63943589850594e-07, 5.63943589850594e-07, 5.63943589850594e-07,
5.63943589850594e-07, 5.63943589850594e-07, 5.63943589850594e-07,
5.63943589850594e-07, 5.63943589850594e-07, 0.000429161071876302,
5.63943589850594e-07, 5.63943589850594e-07, 5.63943589850594e-07,
5.63943589850594e-07, 5.63943589850594e-07, 0.000981825789929885,
5.63943589850594e-07, 5.63943589850594e-07, 5.63943589850594e-07,
5.63943589850594e-07), t_7 = c(1.02250461072892e-07, 0.00025777341236476,
1.02250461072892e-07, 0.000713810468749856, 1.02250461072892e-07,
1.02250461072892e-07, 1.02250461072892e-07, 2.14725968253072e-06,
9.00826562052175e-05, 0.000539984684925941, 0.000298673596793916,
1.02250461072892e-07, 0.000560434777140519, 0.000237323320150181,
0.000110532748419796, 0.000112577757641254, 1.02250461072892e-07,
0.000149387923627495, 1.02250461072892e-07, 1.02250461072892e-07
), t_8 = c(0.000191269160117583, 0.000345858488647592, 0.000416126365252142,
1.40535753209099e-07, 0.000151919149219036, 0.000236240601144495,
1.40535753209099e-07, 1.40535753209099e-07, 0.000112569138320488,
0.00012943342870558, 0.000121001283513034, 1.40535753209099e-07,
5.07334069084847e-05, 0.000255915606593769, 8.57268094575503e-06,
1.40535753209099e-07, 0.000123811998577216, 1.40535753209099e-07,
1.40535753209099e-07, 4.51119767801207e-05), word = c("fireplac_balconi",
"hst", "public_librari", "rain_shower", "ceil_ga", "laundri_ga",
"pool_exercis", "recent_upgrad", "heart_citi", "kitchenaid",
"openconcept", "size_master", "toilet", "washroom", "elizabeth",
"equip_fit", "fireplac_live", "stair", "treed_outlook", "updat_bedroom"
)), row.names = c(NA, 20L), class = "data.frame")
回答
用 order 对每一列按降序排序,然后取出排在前 5 位的行所对应的 word。
使用dplyr:
library(dplyr)
# For every t_* column, rank the rows from largest to smallest value and
# return the `word` entries of the first five ranked rows.
# NOTE(review): dplyr >= 1.1.0 deprecates multi-row results from summarise()
# in favour of reframe() -- confirm against the installed dplyr version.
data %>%
  summarise(
    across(
      starts_with("t_"),
      ~ word[head(order(.x, decreasing = TRUE), n = 5)]
    )
  )
# t_1 t_2 t_3 t_4
#1 pool_exercis treed_outlook heart_citi size_master
#2 equip_fit updat_bedroom openconcept stair
#3 public_librari recent_upgrad equip_fit ceil_ga
#4 heart_citi laundri_ga hst fireplac_live
#5 fireplac_balconi fireplac_balconi fireplac_balconi openconcept
# t_5 t_6 t_7 t_8
#1 elizabeth equip_fit rain_shower public_librari
#2 public_librari kitchenaid toilet hst
#3 hst fireplac_balconi kitchenaid washroom
#4 washroom hst openconcept laundri_ga
#5 ceil_ga public_librari hst fireplac_balconi
基础 R :
# For each of the eight score columns, order the rows by descending value and
# look up the words of the first five rows. head() is required here: with a
# descending order, tail() would select the five smallest values instead.
# Returns a 5 x 8 character matrix whose columns are named after t_1..t_8.
sapply(data[1:8], function(x) data$word[head(order(x, decreasing = TRUE), 5)])
如果您可以使用长格式数据,则可以执行以下操作:
# Long-format alternative: reshape to one row per (word, t_* column) pair,
# then keep the five largest values within each original column.
# with_ties = FALSE caps every group at exactly five rows; the default
# (with_ties = TRUE) also returns every row tied with the fifth-largest value,
# which for a heavily tied column such as t_6 yields far more than five rows.
data %>%
  tidyr::pivot_longer(cols = starts_with("t_")) %>%
  group_by(name) %>%
  slice_max(value, n = 5, with_ties = FALSE) %>%
  ungroup()