医師が教えるR言語での医療データ分析入門-2022年改訂 - 175 全体集計の実践(tidyverse)応用3-across

library(tidyverse)

それでは、スライドで解説したacross関数を、実際にRで実行してみます。

# Small demo table for the across() examples: three numeric (double) columns,
# entered row by row.
test <- tribble(
  ~a1, ~b1, ~a2,
  12,  1,   3.22345,
  13,  2,   3.542352345,
  14,  3,   4.4235235
)

# Print the table to check its contents.
test

# A tibble: 3 × 3
     a1    b1    a2
  <dbl> <dbl> <dbl>
1    12     1  3.22
2    13     2  3.54
3    14     3  4.42

こんなデータがあるとして、

全部の列を100倍する:

# Multiply every column by 100: across() applies the lambda passed as .fns
# to each column selected by .cols (here everything(), i.e. all columns).
test %>% mutate( across(.cols=everything(), .fns = ~{. * 100}) )

# A tibble: 3 × 3
     a1    b1    a2
  <dbl> <dbl> <dbl>
1  1200   100  322.
2  1300   200  354.
3  1400   300  442.

どうでしょうか?mutate関数の中で、acrossを利用して、そのacrossのargumentsに、.colsはeverything()、　.fnsは~{}で無名関数を作成して、その中身として「. * 100」が入れてある構造、理解できますか？

この書き方、argumentの順番さえ間違えなければ、.colsや.fnsを書く必要はないので、

# Same operation with the column selection and the function passed
# positionally (first argument = columns, second = function).
test %>% 
  mutate(across( everything(), ~{.*100}))

# A tibble: 3 × 3
     a1    b1    a2
  <dbl> <dbl> <dbl>
1  1200   100  322.
2  1300   200  354.
3  1400   300  442.

と書いてもOKです。

では、.colsで列の選択がtidy-selectを利用できることを確認してみましょう。

# Multiply by 100 only the columns whose names start with "a" (a1 and a2);
# b1 is left unchanged.
test %>% 
  mutate(across(starts_with("a"), ~{.*100}))

# A tibble: 3 × 3
     a1    b1    a2
  <dbl> <dbl> <dbl>
1  1200     1  322.
2  1300     2  354.
3  1400     3  442.

starts_withで、ちゃんと、aで始まる列だけ100倍されていますね?

他にも、

# Multiply by 100 only the explicitly listed columns a1 and b1.
test %>% 
  mutate(across(c(a1,b1), ~{.*100}))

# A tibble: 3 × 3
     a1    b1    a2
  <dbl> <dbl> <dbl>
1  1200   100  3.22
2  1300   200  3.54
3  1400   300  4.42

# Multiply by 100 every column except b1 (! negates the selection).
test %>% 
  mutate(across(!b1, ~{.*100}))

# A tibble: 3 × 3
     a1    b1    a2
  <dbl> <dbl> <dbl>
1  1200     1  322.
2  1300     2  354.
3  1400     3  442.

# Multiply by 100 only the columns whose names end with "2" (here just a2).
test %>% 
  mutate(across(ends_with("2"), ~{.*100}))

# A tibble: 3 × 3
     a1    b1    a2
  <dbl> <dbl> <dbl>
1    12     1  322.
2    13     2  354.
3    14     3  442.

等も、なにをしているかお判りでしょうか?

それでは、次に、もともととりくんでいたデータの集計について、

# Example vectors (15 values each) that feed the summary pipeline below.
id <- 1:15
age <- c(30, 40, 65, 34, 86, 43, 64, 26, 87, 45, 76, 24, 97, 45, 34)
gender <- c("m", "m", "f", "f", "f", "m", "m", "f", "f", "m", "f", "f", "m", "m", "m")
# Spell out TRUE/FALSE: T and F are ordinary variables that can be reassigned,
# whereas TRUE and FALSE are reserved words.
isx <- c(
  FALSE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE,
  TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, TRUE
)

# Build a one-row summary of age and gender, format each statistic to two
# decimal places, and reshape the result into a two-column display table.
# This version formats each summary column explicitly, one line per column.
tibble(id     = id, 
       age    = age, 
       gender = gender, 
       isx    = isx   ) %>% 
  # Collapse all rows into a single row of summary statistics.
  summarise(
    age_mean = mean(age),
    age_min = min(age),
    age_max = max(age),
    gender_male_n = sum(gender=="m"),
    # Uses gender_male_n from the line above; n() is the number of rows.
    gender_male_p = 100*(gender_male_n/n())
  ) %>% 
  # Round to 2 decimals and always show 2 decimals; format() returns strings,
  # so every column becomes character here.
  mutate(
    age_mean = format(round(age_mean,2),nsmall=2),
    age_min  = format(round(age_min ,2),nsmall=2),
    age_max  = format(round(age_max ,2),nsmall=2),
    gender_male_p = format(round(gender_male_p,2),nsmall=2),
    gender_male_n = format(round(gender_male_n,2),nsmall=2)
  ) %>% 
  # Concatenate the formatted pieces into the display strings; the backticked
  # column names later become the row labels.
  mutate(
    `年齢:平均(最小-最大)` = str_c(age_mean,"(",age_min,"-",age_max,")"),
    `性別:男性 人数(%)` = str_c(gender_male_n,"(", gender_male_p,"%)")
  ) %>% 
  # Keep only the two display columns (names starting with 年齢 or 性別).
  select(matches("^年齢|^性別")) %>% 
  # One row per label: column names go to " ", their values go to "合計".
  pivot_longer(cols = everything(),names_to = " ", values_to = "合計")

# A tibble: 2 × 2
  ` `                  合計              
  <chr>                <chr>             
1 年齢:平均(最小-最大) 53.07(24.00-97.00)
2 性別:男性 人数(%)    8.00(53.33%)

この処理を、acrossを利用して、少し簡潔に書いてみてください。

動画をとめてとりくんでみてください。

できましたか?

こう書けます。

# Same summary table as the explicit version, but the per-column formatting
# step is replaced by a single across() call.
tibble(id     = id, 
       age    = age, 
       gender = gender, 
       isx    = isx   ) %>% 
  # Collapse all rows into a single row of summary statistics.
  summarise(
    age_mean = mean(age),
    age_min = min(age),
    age_max = max(age),
    gender_male_n = sum(gender=="m"),
    # Uses gender_male_n from the line above; n() is the number of rows.
    gender_male_p = 100*(gender_male_n/n())
  ) %>% 
  # Apply the same round-and-format lambda to every summary column at once;
  # format() returns strings, so every column becomes character here.
  mutate(
    across( everything(), ~{format(round(.,2),nsmall=2)})
  ) %>%  
  # Concatenate the formatted pieces into the display strings; the backticked
  # column names later become the row labels.
  mutate(
    `年齢:平均(最小-最大)` = str_c(age_mean,"(",age_min,"-",age_max,")"),
    `性別:男性 人数(%)` = str_c(gender_male_n,"(", gender_male_p,"%)")
  ) %>% 
  # Keep only the two display columns (names starting with 年齢 or 性別).
  select(matches("^年齢|^性別"))  %>% 
  # One row per label: column names go to " ", their values go to "合計".
  pivot_longer(cols = everything(),names_to = " ", values_to = "合計")

# A tibble: 2 × 2
  ` `                  合計              
  <chr>                <chr>             
1 年齢:平均(最小-最大) 53.07(24.00-97.00)
2 性別:男性 人数(%)    8.00(53.33%)

いかがでしょうか?

mutate(
  age_mean = format(round(age_mean,2),nsmall=2),
  age_min  = format(round(age_min ,2),nsmall=2),
  age_max  = format(round(age_max ,2),nsmall=2),
  gender_male_p = format(round(gender_male_p,2),nsmall=2),
  gender_male_n = format(round(gender_male_n,2),nsmall=2)
)

の部分が、

mutate(across( everything(), ~{format(round(.,2),nsmall=2)}))

たったこれだけになりました。

特に列数が多い場合に、有効な方法になりますので、使えそうであれば、使ってみてください。

以上、単純集計の話でした。

次からは、いよいよ集団で集計する話に進んでいきます。