使用dplyr在数据帧的每一列(对应的单词)中查找最大数

我有一个 9 列、20 行的数据框,如下图所示
(数据在本题末尾通过 dput 函数的输出提供):

从第一个单词 (fireplac_balconi) 可以看出,它对应的 8 个数字分别显示在第一列到第八列中。我想要做的是在每列中找到最大的 5 个数字,并在单独的数据框中显示它们在 word 列中对应的单词。
例如,在第 1 列 (t_1) 中,最大的数字是 6.61e-4,这个数字对应的词是 pool_exercis。

我可以通过以下代码为每一列分别执行此操作:

# Keep only the first score column (t_1) and the word column.
data2 <- data[, c("t_1", "word")]
# Sort rows so the largest t_1 values come first.
data3 <- arrange(data2, -t_1)

然后选择前 5 个单词,但我想在一个步骤中对所有列执行此操作,并将输出作为数据帧。我怎样才能做到这一点?

数据(dput 的输出;将下面的表达式赋值给 data,即 data <- structure(...),即可运行文中的代码):

structure(list(t_1 = c(0.000130787327029859, 2.04812368950562e-06, 
0.000279032470270266, 9.75296995002676e-08, 8.78742592497411e-05, 
9.75296995002676e-08, 0.000661348892311314, 8.59236652597357e-05, 
0.000189305146730019, 9.75296995002676e-08, 6.44671313696769e-05, 
9.75296995002676e-08, 9.75296995002676e-08, 4.3010597479618e-05, 
7.81212892997143e-05, 0.000353155041890469, 4.69117854596287e-05, 
9.75296995002676e-08, 9.75296995002676e-08, 9.75296995002676e-08
), t_2 = c(0.000237523589560676, 7.19550407636099e-08, 7.19550407636099e-08, 
7.19550407636099e-08, 0.000120956423523628, 0.000382872771903168, 
7.19550407636099e-08, 0.000413093889023885, 9.36135080334565e-05, 
7.19550407636099e-08, 1.15847615629412e-05, 4.18058786836574e-05, 
7.05878949891013e-05, 3.02930721614798e-05, 2.95015667130801e-06, 
7.19550407636099e-08, 0.000182837758580333, 2.88539713462076e-05, 
0.000502318139570761, 0.000428923997991879), t_3 = c(5.33688163157398e-05, 
7.99582621669981e-05, 1.89924613223273e-07, 1.89924613223273e-07, 
1.89924613223273e-07, 1.89924613223273e-07, 4.57718317868089e-05, 
3.0577862728947e-05, 0.00029647232124153, 1.89924613223273e-07, 
0.000148331122927376, 1.53838936710851e-05, 3.81748472578779e-05, 
1.89924613223273e-07, 1.89924613223273e-07, 0.000114144692547187, 
1.89924613223273e-07, 1.89924613223273e-07, 1.89924613223273e-07, 
1.89924613223273e-07), t_4 = c(8.54762655821952e-05, 6.10893139182308e-05, 
1.21934758319822e-07, 2.56062992471626e-06, 0.000387874466215354, 
1.21934758319822e-07, 1.21934758319822e-07, 2.45088864222842e-05, 
6.35280090846272e-05, 1.21934758319822e-07, 0.000197656243236431, 
0.000775626997672387, 1.21934758319822e-07, 0.000168391901239674, 
1.21934758319822e-07, 1.21934758319822e-07, 0.000382997075882561, 
0.000634182678021394, 1.2315410590302e-05, 9.27923510813845e-05
), t_5 = c(2.86680234286555e-07, 0.000298434123892303, 0.000338569356692421, 
2.86680234286555e-07, 4.04219130344042e-05, 2.86680234286555e-07, 
2.86680234286555e-07, 2.86680234286555e-07, 2.86680234286555e-07, 
1.17538896057487e-05, 2.86680234286555e-07, 2.86680234286555e-07, 
2.86680234286555e-07, 0.00017802842549195, 0.00145088866572425, 
2.86680234286555e-07, 2.86680234286555e-07, 2.86680234286555e-07, 
2.86680234286555e-07, 2.86680234286555e-07), t_6 = c(5.63943589850594e-07, 
5.63943589850594e-07, 5.63943589850594e-07, 5.63943589850594e-07, 
5.63943589850594e-07, 5.63943589850594e-07, 5.63943589850594e-07, 
5.63943589850594e-07, 5.63943589850594e-07, 0.000429161071876302, 
5.63943589850594e-07, 5.63943589850594e-07, 5.63943589850594e-07, 
5.63943589850594e-07, 5.63943589850594e-07, 0.000981825789929885, 
5.63943589850594e-07, 5.63943589850594e-07, 5.63943589850594e-07, 
5.63943589850594e-07), t_7 = c(1.02250461072892e-07, 0.00025777341236476, 
1.02250461072892e-07, 0.000713810468749856, 1.02250461072892e-07, 
1.02250461072892e-07, 1.02250461072892e-07, 2.14725968253072e-06, 
9.00826562052175e-05, 0.000539984684925941, 0.000298673596793916, 
1.02250461072892e-07, 0.000560434777140519, 0.000237323320150181, 
0.000110532748419796, 0.000112577757641254, 1.02250461072892e-07, 
0.000149387923627495, 1.02250461072892e-07, 1.02250461072892e-07
), t_8 = c(0.000191269160117583, 0.000345858488647592, 0.000416126365252142, 
1.40535753209099e-07, 0.000151919149219036, 0.000236240601144495, 
1.40535753209099e-07, 1.40535753209099e-07, 0.000112569138320488, 
0.00012943342870558, 0.000121001283513034, 1.40535753209099e-07, 
5.07334069084847e-05, 0.000255915606593769, 8.57268094575503e-06, 
1.40535753209099e-07, 0.000123811998577216, 1.40535753209099e-07, 
1.40535753209099e-07, 4.51119767801207e-05), word = c("fireplac_balconi", 
"hst", "public_librari", "rain_shower", "ceil_ga", "laundri_ga", 
"pool_exercis", "recent_upgrad", "heart_citi", "kitchenaid", 
"openconcept", "size_master", "toilet", "washroom", "elizabeth", 
"equip_fit", "fireplac_live", "stair", "treed_outlook", "updat_bedroom"
)), row.names = c(NA, 20L), class = "data.frame")

回答

用 order 对每一列按降序排序,然后取出排在前 5 位的行所对应的 word。

使用dplyr

library(dplyr)

# For every score column (names starting with "t_"), return the five words
# whose values in that column are largest, ordered from largest to smallest.
# The result has one column per score column and five rows.
#
# reframe() is used instead of summarise() because each column yields five
# rows; returning more than one row per group from summarise() is deprecated
# since dplyr 1.1.0. On older dplyr versions, replace reframe() with
# summarise().
data %>%
  reframe(across(
    starts_with("t_"),
    # order(..., decreasing = TRUE) gives row indices from largest to
    # smallest value; head(., 5) keeps the top five of them.
    ~ word[head(order(.x, decreasing = TRUE), 5)]
  ))

#              t_1              t_2              t_3           t_4
#1     pool_exercis    treed_outlook       heart_citi   size_master
#2        equip_fit    updat_bedroom      openconcept         stair
#3   public_librari    recent_upgrad        equip_fit       ceil_ga
#4       heart_citi       laundri_ga              hst fireplac_live
#5 fireplac_balconi fireplac_balconi fireplac_balconi   openconcept

#             t_5              t_6         t_7              t_8
#1      elizabeth        equip_fit rain_shower   public_librari
#2 public_librari       kitchenaid      toilet              hst
#3            hst fireplac_balconi  kitchenaid         washroom
#4       washroom              hst openconcept       laundri_ga
#5        ceil_ga   public_librari         hst fireplac_balconi

基础 R :

# Base R equivalent: for each of the eight score columns, look up the words
# belonging to the five largest values. Returns a 5 x 8 character matrix with
# one column per score column.
# head() (not tail()) is required here: the indices are ordered from largest
# to smallest, so the top five are at the front of the vector.
sapply(data[1:8], function(x) {
  data$word[head(order(x, decreasing = TRUE), 5)]
})

如果您可以使用长格式数据,则可以执行以下操作:

# Long-format alternative: stack all "t_" columns into name/value pairs, then
# keep the five largest values within each original column.
data %>%
  tidyr::pivot_longer(cols = starts_with("t_")) %>%
  group_by(name) %>%
  # with_ties = FALSE guarantees exactly five rows per column; the default
  # (with_ties = TRUE) would return every tied row, e.g. all 20 rows for t_6,
  # where 18 values are identical.
  slice_max(value, n = 5, with_ties = FALSE) %>%
  ungroup()


以上是使用dplyr在数据帧的每一列(对应的单词)中查找最大数的全部内容。
THE END
分享
二维码
< <上一篇
下一篇>>