解析一列json并与另一列绑定以制作数据框

html5 • 2022年9月21日 pm2:36 • 问答

我有采用以下格式的数据：

have <- structure(list(V1 = c(4L, 28L, 2L), 
                       V2 = c("[{"group":1,"topic":"A"},{"group":1,"topic":"B"},{"group":2,"topic":"C"},{"group":2,"topic":"T"},{"group":2,"topic":"U"},{"group":3,"topic":"V"},{"group":3,"topic":"D"},{"group":3,"topic":"R"},{"group":4,"topic":"A"},{"group":4,"topic":"Q"},{"group":4,"topic":"S"},{"group":4,"topic":"W"},{"group":6,"topic":"O"},{"group":6,"topic":"P"},{"group":6,"topic":"E"},{"group":6,"topic":"F"},{"group":6,"topic":"G"},{"group":6,"topic":"H"},{"group":6,"topic":"I"},{"group":6,"topic":"J"},{"group":6,"topic":"K"},{"group":6,"topic":"L"},{"group":6,"topic":"M"},{"group":6,"topic":"N"}]",
                              "[]", 
                              "[{"group":2,"topic":"C"},{"group":3,"topic":"D"},{"group":6,"topic":"O"},{"group":6,"topic":"P"},{"group":6,"topic":"E"},{"group":6,"topic":"G"},{"group":6,"topic":"M"}]")
), 
row.names = c(NA, 3L), 
class = "data.frame")

的内容V2是每一行的嵌套分组，如[{"group":1,"topic":"A"},{"group":1,"topic":"B"}...]

我想获得一个宽数据框，其中also_have每行的 group+topic 的每个组合都有一个指示符 (1/0)（请参阅参考资料）。像这样的东西：

# A tibble: 3 x 4
     id topic_id_1 topic_id_2 topic_id_3 topic_id_4 ...
  <dbl>      <dbl>      <dbl>      <dbl>
1     4          1          1          0
2    28          0          0          0
3     2          0          0          0

第一步是解析json。

我可以使用purrr::map(have$V2, jsonlite::fromJSON)取消嵌套到列表中，但我不确定如何将V1列（我们可能会重命名为id）绑定到结果列表的每个元素（请注意，列表元素二是空的，因为它V1==28是空的）。下面是添加了id( V1)的第一个元素可能是什么样子的片段。

[[1]]
   group topic  id
1      1     A   4
2      1     B   4
3      2     C   4
4      2     T   4
...

或者，我认为purrr::map_df(have$V2, jsonlite::fromJSON)会让我更接近我最终需要的东西，但在这里我也不确定如何添加行id( V1)。

df <- purrr::map_df(have$V2, jsonlite::fromJSON)
df

What I get:

   group topic
1      1     A
2      1     B
3      2     C
4      2     T
...

What I want (notice `V1==28` does not appear): 

   group topic id
1      1     A  4
2      1     B  4
3      2     C  4
4      2     T  4
5      2     U  4
6      3     V  4
7      3     D  4
8      3     R  4
9      4     A  4
10     4     Q  4
11     4     S  4
12     4     W  4
13     6     O  4
14     6     P  4
15     6     E  4
16     6     F  4
17     6     G  4
18     6     H  4
19     6     I  4
20     6     J  4
21     6     K  4
22     6     L  4
23     6     M  4
24     6     N  4
25     2     C  2
26     3     D  2
27     6     O  2
28     6     P  2
29     6     E  2
30     6     G  2
31     6     M  2

停止。

我想如果我能得到上面的数据框，id我就可以完成剩下的工作。最终目标是加入这些信息，also_have然后广泛使用。

# join
also_have <- expand_grid(c(1:6), c(LETTERS)) %>%
  mutate(topic_id = 1:n()) %>%
  magrittr::set_colnames(c("group", "topic", "topic_id")) %>%
  select(topic_id, group, topic)

# pivot wide

# A tibble: 3 x 4
     id topic_id_1 topic_id_2 topic_id_3 topic_id_4 ...
  <dbl>      <dbl>      <dbl>      <dbl>
1     4          1          1          0
2    28          0          0          0
3     2          0          0          0

更新：

应用@akrun 的解决方案：

purrr::map_dfr(setNames(have$V2, have$V1), 
                     jsonlite::fromJSON, 
                     .id = 'V1') %>%
  rename(id = V1) %>%
  left_join(also_have, by=c("group", "topic")) %>%
  select(-group, -topic) %>%
  mutate(value = 1) %>%
  pivot_wider(id_cols = id, 
              names_from = topic_id,
              names_prefix = "topic_id",
              values_from = value,
              values_fill = 0
  ) %>% 
  full_join(tibble(id = as.character(have$V1))) %>%
  replace(is.na(.), 0)

# A tibble: 3 x 25
  id    topic_id1 topic_id2 topic_id29 topic_id46 topic_id47 topic_id74 topic_id56
  <chr>     <dbl>     <dbl>      <dbl>      <dbl>      <dbl>      <dbl>      <dbl>
1 4             1         1          1          1          1          1          1
2 2             0         0          1          0          0          0          1
3 28            0         0          0          0          0          0          0
# … with 17 more variables: topic_id70 <dbl>, topic_id79 <dbl>, topic_id95 <dbl>,
#   topic_id97 <dbl>, topic_id101 <dbl>, topic_id145 <dbl>, topic_id146 <dbl>,
#   topic_id135 <dbl>, topic_id136 <dbl>, topic_id137 <dbl>, topic_id138 <dbl>,
#   topic_id139 <dbl>, topic_id140 <dbl>, topic_id141 <dbl>, topic_id142 <dbl>,
#   topic_id143 <dbl>, topic_id144 <dbl>

回答

我们可以传递一个命名向量，然后.id在map_dfr

 purrr::map_dfr(setNames(have$V2, have$V1), jsonlite::fromJSON, .id = 'id')

-输出

id group topic
1   4     1     A
2   4     1     B
3   4     2     C
4   4     2     T
5   4     2     U
6   4     3     V
7   4     3     D
8   4     3     R
9   4     4     A
10  4     4     Q
11  4     4     S
12  4     4     W
...

或者这可以dplyr在使用后在框架本身内完成rowwise

library(tidyr)
have %>%
    rowwise %>%
    transmute(ID = V1, V2 = list(fromJSON(V2))) %>% 
    ungroup %>%
    unnest(c(V2), keep_empty = TRUE) %>% 
    select(-V2)
# A tibble: 32 x 3
      ID group topic
   <int> <int> <chr>
 1     4     1 A    
 2     4     1 B    
 3     4     2 C    
 4     4     2 T    
 5     4     2 U    
 6     4     3 V    
 7     4     3 D    
 8     4     3 R    
 9     4     4 A    
10     4     4 Q    
# … with 22 more rows

第二步做一个连接

out <-  have %>%
     rowwise %>%
     transmute(ID = V1, V2 = list(fromJSON(V2))) %>% 
     ungroup %>%
     unnest(c(V2), keep_empty = TRUE) %>% 
     select(-V2) %>% right_join(also_have)
out
Joining, by = c("group", "topic")
# A tibble: 163 x 4
      ID group topic topic_id
   <int> <int> <chr>    <int>
 1     4     1 A            1
 2     4     1 B            2
 3     4     2 C           29
 4     4     2 T           46
 5     4     2 U           47
 6     4     3 V           74
 7     4     3 D           56
 8     4     3 R           70
 9     4     4 A           79
10     4     4 Q           95
# … with 153 more rows

以上是解析一列json并与另一列绑定以制作数据框的全部内容。

THE END

二维码

如何停止C++20std::chronozoned_time打印GMT-4而不是EST？

< <上一篇

如何从具有相同前缀的.y中减去多个.x

下一篇>>

搜索内容

解析一列json并与另一列绑定以制作数据框

回答

目录

目录

推荐文章

最新文章