即使缺少資料點，如何以特定順序（月年）顯示寬表？-有解無憂

    > df_1
    # A tibble: 47 x 3
    # Groups:   therapy_class [9]
       therapy_class             Year_month count
       <ord>                     <yearmon>  <int>
     1 ALK Inhibitors            Dec 2019      16
     2 ALK Inhibitors            Jan 2020      14
     3 ALK Inhibitors            Feb 2020      14
     4 ALK Inhibitors            Mar 2020      22
     5 ALK Inhibitors            Apr 2020      13
     6 ALK Inhibitors            May 2020      17
     7 Anti-VEGF-based therapies Dec 2019      33
     8 Anti-VEGF-based therapies Jan 2020      35
     9 Anti-VEGF-based therapies Feb 2020      36
    10 Anti-VEGF-based therapies Mar 2020      20
    # … with 37 more rows



    A tibble: 10 x 7
       therapy_class                    `Dec 2019`         `Jan 2020`         `Feb 2020`        `Mar 2020`        `Apr 2020`        `May 2020`       
       <ord>                            <chr>              <chr>              <chr>             <chr>             <chr>             <chr>            
     1 ALK Inhibitors                   "16 <br>[2.7%]"    "14 <br>[2.0%]"    "14 <br>[2.2%]"   "22 <br>[3.3%]"   "13 <br>[2.1%]"   "17 <br>[3.4%]"  
     2 Anti-VEGF-based therapies        "33 <br>[5.6%]"    "35 <br>[4.9%]"    "36 <br>[5.7%]"   "20 <br>[3.0%]"   "21 <br>[3.4%]"   "20 <br>[4.0%]"  
     3 EGFR TKIs                        "52 <br>[8.8%]"    "57 <br>[8.0%]"    "60 <br>[9.5%]"   "52 <br>[7.8%]"   "56 <br>[9.2%]"   "49 <br>[9.8%]"  
     4 EGFR-antibody based therapies    ""                 ""                 ""                ""                ""                ""               
     5 Non-platinum-based chemotherapy… "1 <br>[0.2%]"     "4 <br>[0.6%]"     "4 <br>[0.6%]"    ""                "1 <br>[0.2%]"    ""               
     6 IO-based therapies               "308 <br>[52.0%]"  "385 <br>[54.0%]"  "330 <br>[52.3%]" "379 <br>[56.7%]" "345 <br>[56.4%]" "265 <br>[52.9%]"
     7 Platinum-based chemotherapy com… "123 <br>[20.8%]"  "147 <br>[20.6%]"  "128 <br>[20.3%]" "134 <br>[20.1%]" "120 <br>[19.6%]" "107 <br>[21.4%]"
     8 Single agent chemotherapies      "29 <br>[4.9%]"    "33 <br>[4.6%]"    "17 <br>[2.7%]"   "28 <br>[4.2%]"   "25 <br>[4.1%]"   "22 <br>[4.4%]"  
     9 Other                            "30 <br>[5.1%]"    "38 <br>[5.3%]"    "42 <br>[6.7%]"   "33 <br>[4.9%]"   "31 <br>[5.1%]"   "21 <br>[4.2%]"  
    10 <strong>Total</strong>           "<strong>592</str… "<strong>713</str… "<strong>631</st… "<strong>668</st… "<strong>612</st… "<strong>501</st…


    > df_2
    # A tibble: 46 x 3
    # Groups:   therapy_class [9]
       therapy_class             Year_month count
       <ord>                     <yearmon>  <int>
     1 ALK Inhibitors            Dec 2019      16
     2 ALK Inhibitors            Feb 2020      14
     3 ALK Inhibitors            Mar 2020      22
     4 ALK Inhibitors            Apr 2020      13
     5 ALK Inhibitors            May 2020      17
     6 Anti-VEGF-based therapies Dec 2019      33
     7 Anti-VEGF-based therapies Jan 2020      35
     8 Anti-VEGF-based therapies Feb 2020      36
     9 Anti-VEGF-based therapies Mar 2020      20
    10 Anti-VEGF-based therapies Apr 2020      21
    # … with 36 more rows

> t2
# A tibble: 10 x 7
   therapy_class                    `Dec 2019`         `Feb 2020`         `Mar 2020`        `Apr 2020`        `May 2020`        `Jan 2020`       
   <ord>                            <chr>              <chr>              <chr>             <chr>             <chr>             <chr>            
 1 ALK Inhibitors                   "16 <br>[2.7%]"    "14 <br>[2.2%]"    "22 <br>[3.3%]"   "13 <br>[2.1%]"   "17 <br>[3.4%]"   ""               
 2 Anti-VEGF-based therapies        "33 <br>[5.6%]"    "36 <br>[5.7%]"    "20 <br>[3.0%]"   "21 <br>[3.4%]"   "20 <br>[4.0%]"   "35 <br>[5.0%]"  
 3 EGFR TKIs                        "52 <br>[8.8%]"    "60 <br>[9.5%]"    "52 <br>[7.8%]"   "56 <br>[9.2%]"   "49 <br>[9.8%]"   "57 <br>[8.2%]"  
 4 EGFR-antibody based therapies    ""                 ""                 ""                ""                ""                ""               
 5 Non-platinum-based chemotherapy… "1 <br>[0.2%]"     "4 <br>[0.6%]"     ""                "1 <br>[0.2%]"    ""                "4 <br>[0.6%]"   
 6 IO-based therapies               "308 <br>[52.0%]"  "330 <br>[52.3%]"  "379 <br>[56.7%]" "345 <br>[56.4%]" "265 <br>[52.9%]" "385 <br>[55.1%]"
 7 Platinum-based chemotherapy com… "123 <br>[20.8%]"  "128 <br>[20.3%]"  "134 <br>[20.1%]" "120 <br>[19.6%]" "107 <br>[21.4%]" "147 <br>[21.0%]"
 8 Single agent chemotherapies      "29 <br>[4.9%]"    "17 <br>[2.7%]"    "28 <br>[4.2%]"   "25 <br>[4.1%]"   "22 <br>[4.4%]"   "33 <br>[4.7%]"  
 9 Other                            "30 <br>[5.1%]"    "42 <br>[6.7%]"    "33 <br>[4.9%]"   "31 <br>[5.1%]"   "21 <br>[4.2%]"   "38 <br>[5.4%]"  
10 <strong>Total</strong>           "<strong>592</str… "<strong>631</str… "<strong>668</st… "<strong>612</st… "<strong>501</st… "<strong>699</st…
>

我正在嘗試從長表中創建一個包含計數和百分比的寬表。列是“月年”，需要按順序排列。我的問題是，當第一組（ALK 抑制劑）的某個“月年”缺少行時，列的順序就會被打亂。缺少的“月年”放在最后。長桌也不是固定桌。它是由用戶可以選擇月份年份范圍的功能生成的。所以 Year_month 列可以有任何范圍。

在這個例子中，我使用了 2019 年 12 月到 2020 年 5 月的 6 個月范圍。“df_1”有全部 6 個月，因此生成的寬表符合預期。“df_2”缺少 2020 年 1 月的 ALK 抑制劑。因此，結果表的末尾是“Jan 2020”。

這是我用來生成寬表的代碼：

df_2 %>%
  pivot_wider(names_from = Year_month, values_from = count) %>%
  ungroup() %>%
  mutate_at(.vars = vars(contains("20")), list(
    ~ ifelse(is.na(.), "", paste(., sprintf("<br>[%1.1f%%]", 100 * (. / sum(., na.rm = TRUE)))))
  ))

這是示例資料 df_2

structure(list(therapy_class = structure(c(1L, 1L, 1L, 1L, 1L, 
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 5L, 5L, 5L, 
5L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 
8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L), .Label = c("ALK Inhibitors", 
"Anti-VEGF-based therapies", "EGFR TKIs", "EGFR-antibody based therapies", 
"Non-platinum-based chemotherapy combinations", "IO-based therapies", 
"Platinum-based chemotherapy combinations", "Single agent chemotherapies", 
"Other"), class = c("ordered", "factor")), Year_month = structure(c(2019.91666666667, 
2020.08333333333, 2020.16666666667, 2020.25, 2020.33333333333, 
2019.91666666667, 2020, 2020.08333333333, 2020.16666666667, 2020.25, 
2020.33333333333, 2019.91666666667, 2020, 2020.08333333333, 2020.16666666667, 
2020.25, 2020.33333333333, NA, 2019.91666666667, 2020, 2020.08333333333, 
2020.25, 2019.91666666667, 2020, 2020.08333333333, 2020.16666666667, 
2020.25, 2020.33333333333, 2019.91666666667, 2020, 2020.08333333333, 
2020.16666666667, 2020.25, 2020.33333333333, 2019.91666666667, 
2020, 2020.08333333333, 2020.16666666667, 2020.25, 2020.33333333333, 
2019.91666666667, 2020, 2020.08333333333, 2020.16666666667, 2020.25, 
2020.33333333333), class = "yearmon"), count = c(16L, 14L, 22L, 
13L, 17L, 33L, 35L, 36L, 20L, 21L, 20L, 52L, 57L, 60L, 52L, 56L, 
49L, NA, 1L, 4L, 4L, 1L, 308L, 385L, 330L, 379L, 345L, 265L, 
123L, 147L, 128L, 134L, 120L, 107L, 29L, 33L, 17L, 28L, 25L, 
22L, 30L, 38L, 42L, 33L, 31L, 21L)), row.names = c(NA, -46L), groups = structure(list(
    therapy_class = structure(1:9, .Label = c("ALK Inhibitors", 
    "Anti-VEGF-based therapies", "EGFR TKIs", "EGFR-antibody based therapies", 
    "Non-platinum-based chemotherapy combinations", "IO-based therapies", 
    "Platinum-based chemotherapy combinations", "Single agent chemotherapies", 
    "Other"), class = c("ordered", "factor")), .rows = structure(list(
        1:5, 6:11, 12:17, 18L, 19:22, 23:28, 29:34, 35:40, 41:46), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -9L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

uj5u.com熱心網友回復：

一種選擇是complete在執行pivot_wider. 與pivot_wider，默認順序是根據訂單中的唯一值出現

library(dplyr)
library(tidyr)
library(zoo)
df_2 %>%
    ungroup %>% 
    mutate(Year_month = as.Date(Year_month)) %>% 
    complete(therapy_class, Year_month =  seq(from = min(Year_month, 
     na.rm = TRUE), to = max(Year_month, na.rm = TRUE),
       by = '1 month')) %>% 
    mutate(Year_month = as.yearmon(Year_month)) %>% 
    pivot_wider(names_from = Year_month, values_from = count) %>%
    ungroup() %>%
   mutate_at(.vars = vars(contains("20")),
     list(
    ~ ifelse(is.na(.), "", paste(., sprintf("<br>[%1.1f%%]",
     100 * (. / sum(., na.rm = TRUE)))))
  ))

-輸出

# A tibble: 9 × 8
  therapy_class                                `Dec 2019`        `Jan 2020`        `Feb 2020`        `Mar 2020`        `Apr 2020`     `May 2020`     `NA`
  <ord>                                        <chr>             <chr>             <chr>             <chr>             <chr>          <chr>         <int>
1 ALK Inhibitors                               "16 <br>[2.7%]"   ""                "14 <br>[2.2%]"   "22 <br>[3.3%]"   "13 <br>[2.1%… "17 <br>[3.4…    NA
2 Anti-VEGF-based therapies                    "33 <br>[5.6%]"   "35 <br>[5.0%]"   "36 <br>[5.7%]"   "20 <br>[3.0%]"   "21 <br>[3.4%… "20 <br>[4.0…    NA
3 EGFR TKIs                                    "52 <br>[8.8%]"   "57 <br>[8.2%]"   "60 <br>[9.5%]"   "52 <br>[7.8%]"   "56 <br>[9.2%… "49 <br>[9.8…    NA
4 EGFR-antibody based therapies                ""                ""                ""                ""                ""             ""               NA
5 Non-platinum-based chemotherapy combinations "1 <br>[0.2%]"    "4 <br>[0.6%]"    "4 <br>[0.6%]"    ""                "1 <br>[0.2%]" ""               NA
6 IO-based therapies                           "308 <br>[52.0%]" "385 <br>[55.1%]" "330 <br>[52.3%]" "379 <br>[56.7%]" "345 <br>[56.… "265 <br>[52…    NA
7 Platinum-based chemotherapy combinations     "123 <br>[20.8%]" "147 <br>[21.0%]" "128 <br>[20.3%]" "134 <br>[20.1%]" "120 <br>[19.… "107 <br>[21…    NA
8 Single agent chemotherapies                  "29 <br>[4.9%]"   "33 <br>[4.7%]"   "17 <br>[2.7%]"   "28 <br>[4.2%]"   "25 <br>[4.1%… "22 <br>[4.4…    NA
9 Other                                        "30 <br>[5.1%]"   "38 <br>[5.4%]"   "42 <br>[6.7%]"   "33 <br>[4.9%]"   "31 <br>[5.1%… "21 <br>[4.2…    NA

轉載請註明出處，本文鏈接：https://www.uj5u.com/gongcheng/318521.html

標籤：r dplyr 缺失数据长桌年月

上一篇：有條件地計算資料幀中出現的次數-性能改進

下一篇：完成一些R特定計算后如何將向量從Python傳輸到R（并回傳）