diff --git a/01_introduction.Rmd b/01_introduction.Rmd index 68e597d..c5a4505 100644 --- a/01_introduction.Rmd +++ b/01_introduction.Rmd @@ -152,7 +152,7 @@ The "Conflicts" message is expected and can safely be ignored.^[It just means th We've never needed to use `filter` and `lag` from __stats__, but if you do, then use the double colon, i.e., `stats::filter()` or `stats::lag()`, as just `filter()` will use the __dplyr__ one.] There are a few other R packages that we use and are not part of the tidyverse, but we will introduce them as we go along. -If you're incredibly curious, head to the Resources section of the HealthyR website - up-to-date links and installation instructions are listed there. +If you're incredibly curious, head to the Resources section of the HealthyR website which is the best place to find up-to-date links and installation instructions. Our R and package versions are also listed in the Appendix. ## Getting help \index{help} diff --git a/03_summarising.Rmd b/03_summarising.Rmd index 3cd3be8..bceb49c 100755 --- a/03_summarising.Rmd +++ b/03_summarising.Rmd @@ -163,12 +163,13 @@ We can also further lines to `mutate()` to calculate the percentage of each grou ```{r, message = FALSE} # percent() function for formatting percentages come from library(scales) library(scales) -gbd2017 %>% +gbd2017_summarised <- gbd2017 %>% group_by(cause, sex) %>% summarise(deaths_per_group = sum(deaths_millions)) %>% ungroup() %>% mutate(deaths_total = sum(deaths_per_group), deaths_relative = percent(deaths_per_group/deaths_total)) +gbd2017_summarised ``` The `percent()` function comes from `library(scales)` and is a handy way of formatting percentages @@ -182,17 +183,13 @@ round(100*4.91/55.74, 1) %>% paste0("%") This is convenient for final presentation of number, but if you intend to do further calculations/plot/sort the percentages just calculate them as fractions with: -```{r, eval = FALSE} -gbd2017 %>% +```{r} +gbd2017_summarised %>% mutate(deaths_relative = 
deaths_per_group/deaths_total) ``` -and convert to nicely formatted percentages later: +and convert to nicely formatted percentages later with `mutate(deaths_percentage = percent(deaths_relative))`. -```{r, eval = FALSE} -gbd2017 %>% - mutate(deaths_percentage = percent(deaths_relative)) -``` ## `summarise()` vs `mutate()` diff --git a/04_plotting.Rmd b/04_plotting.Rmd index 241c40f..812f718 100644 --- a/04_plotting.Rmd +++ b/04_plotting.Rmd @@ -823,7 +823,7 @@ gapminder %>% Solution to Exercise \@ref(chap04-ex-barplot): -```{r, eval = FALSE} +```{r, fig.keep = 'none'} library(tidyverse) library(gapminder) diff --git a/06_working_continuous.Rmd b/06_working_continuous.Rmd index 4a497d5..fecc0a1 100644 --- a/06_working_continuous.Rmd +++ b/06_working_continuous.Rmd @@ -65,7 +65,7 @@ glimpse(gapdata) # each variable as line, variable type, first values missing_glimpse(gapdata) # missing data for each variable ``` -```{r eval = FALSE} +```{r results = "hide"} ff_glimpse(gapdata) # summary statistics for each variable ``` @@ -84,6 +84,10 @@ sum_gapdata[[1]] %>% linesep = c("", "", "\\addlinespace")) %>% kable_styling(latex_options = c("scale_down", "hold_position")) %>% column_spec(1, width = "4cm") +``` + + +```{r message=FALSE, echo=FALSE} sum_gapdata[[2]] %>% select(-c(5, 9)) %>% kable(row.names = FALSE, align = c("l", "l", "l", "r", "r", "r", "r", "r", "r", "r"), diff --git a/07_linear_regression.Rmd b/07_linear_regression.Rmd index 358b396..159d6ee 100644 --- a/07_linear_regression.Rmd +++ b/07_linear_regression.Rmd @@ -898,6 +898,7 @@ This great post demonstrates beautifully how the statistical tests we are most f Regression is fitting lines, preferably straight, through data points. Make $\hat{y} = \beta_0 + \beta_1 x_1$ a close friend. +An excellent book for further reading on regression is @harrell2015. 
## Exercises diff --git a/09_logistic_regression.Rmd b/09_logistic_regression.Rmd index 8583754..8593b11 100644 --- a/09_logistic_regression.Rmd +++ b/09_logistic_regression.Rmd @@ -1146,7 +1146,7 @@ Make an odds ratio plot for this model. Solution to Exercise \@ref(chap09-ex1): -```{r eval=FALSE, fig.height=3, fig.width=5} +```{r fig.height=3, fig.width=5, fig.keep = 'none'} ## Recode melanoma <- melanoma %>% mutate(sex.factor = factor(sex) %>% @@ -1192,7 +1192,7 @@ p1 + p2 Solution to Exercise \@ref(chap09-ex2): -```{r eval=FALSE} +```{r message=FALSE, results = "hide"} ## Recode T-stage first melanoma <- melanoma %>% mutate( @@ -1216,7 +1216,7 @@ melanoma %>% Solution to Exercise \@ref(chap09-ex3): -```{r eval=FALSE} +```{r message=FALSE, warning=FALSE, results = "hide", fig.keep="none"} dependent = "mort_5yr" explanatory = c("sex.factor", "age", "t_stage.factor", "ulcer.factor") melanoma %>% @@ -1224,27 +1224,16 @@ melanoma %>% # c-statistic = 0.798 # In multivariable model, male vs female OR 1.26 (0.57-2.76, p=0.558). -# No relationship after accouting for T-stage and tumour ulceration. +# No relationship after accounting for T-stage and tumour ulceration. # Sex is confounded by these two variables. 
``` Solution to Exercise \@ref(chap09-ex4): -```{r eval=FALSE} +```{r fig.keep="none", message=FALSE, warning=FALSE} dependent = "mort_5yr" explanatory = c("sex.factor", "age", "t_stage.factor", "ulcer.factor") melanoma %>% or_plot(dependent, explanatory) ``` - - - -```{r include=FALSE, eval = FALSE} -library(ggfortify) -dependent <- "mort_5yr" -explanatory_multi <- c("ulcer.factor", "t_stage.factor") -melanoma %>% - glmmulti(dependent, explanatory) %>% - autoplot(which=1:6) -``` diff --git a/10_survival.Rmd b/10_survival.Rmd index e8a6424..9852a0e 100644 --- a/10_survival.Rmd +++ b/10_survival.Rmd @@ -106,7 +106,7 @@ melanoma <- melanoma %>% \index{time-to-event / survival@\textbf{time-to-event / survival}!log-rank test} \index{log-rank test} -We will use the excellent **survival** package to produce the Kaplan Meier (KM) survival estimator. +We will use the excellent **survival** package to produce the Kaplan Meier (KM) survival estimator [@therneau2000; @therneau2020]. This is a non-parametric statistic used to estimate the survival function from time-to-event data. ```{r} diff --git a/11_missing_data.Rmd b/11_missing_data.Rmd index 5d2e891..8f317cc 100644 --- a/11_missing_data.Rmd +++ b/11_missing_data.Rmd @@ -1,6 +1,6 @@ # (PART) Workflow {-} -Throughout this book we have tried to provide the most efficient approaches data analysis using R. +Throughout this book we have tried to provide the most efficient approaches to data analysis using R. In this section, we will provide workflows, or ways-of-working, which maximise efficiency, incorporate reporting of results within analyses, make exporting of tables and plots easy, and keep data safe, secured and backed up. We also include a section on dealing with missing data in R. Something that we both feel strongly about and which is often poorly described and dealt with in academic publishing.
diff --git a/13_exporting.Rmd b/13_exporting.Rmd index 9f82ed7..3e35d3c 100644 --- a/13_exporting.Rmd +++ b/13_exporting.Rmd @@ -229,6 +229,16 @@ No problem, you update the dataset, re-run the script that created the tables an No more mindless re-doing for you. We think this is pretty amazing. +### Figure quality in Word output + +If your plots are looking a bit grainy in Word, include this in your setup chunk for high quality: + +```{r} +knitr::opts_chunk$set(dpi = 300) +``` + +The setup chunk is the one that starts with ```` ```{r setup, include = FALSE} ```` and is generated automatically when you create a new R Markdown document in RStudio. + ## Create Word template file To make sure tables always export with a suitable font size, you may edit your Word file but only to create a new template. diff --git a/16_appendix.Rmd b/16_appendix.Rmd new file mode 100644 index 0000000..974401a --- /dev/null +++ b/16_appendix.Rmd @@ -0,0 +1,43 @@ +# Appendix {-} + +```{r message=FALSE, warning=FALSE, echo = FALSE} +library(tidyverse) +library(knitr) +# all used packages: +system("grep '^library(' *Rmd > library_calls.txt") +used_packages_orig = read_table("library_calls.txt", col_names = "packages") + +used_packages = used_packages_orig %>% + separate(packages, into = c(NA, "packages"), sep = ":") %>% + separate(packages, into = c("packages", "comment"), sep = "#") %>% + mutate(packages = str_trim(packages)) %>% + distinct(packages) %>% + bind_rows(tibble(packages = c("tibble", "tidyr", "purrr", "stringr", "bookdown", "rmarkdown"))) %>% + mutate(packages = str_remove(packages, "library\\(") %>% str_remove("\\)") %>% + fct_relevel("tidyverse", + "ggplot2", + "tibble", + "tidyr", + "readr", + "purrr", + "dplyr", + "stringr", + "forcats", + "finalfit")) %>% + arrange(packages) %>% + pull(packages) + + +``` + + +This book was written in **bookdown**, which is an R package built on top of R Markdown [@xie2016].
+ +The main packages used in this book were: **`r combine_words(used_packages)`**. + +R and package versions, `sessionInfo()`: + +```{r, echo = FALSE} +xfun::session_info(used_packages, dependencies = FALSE) +``` + diff --git a/_bookdown.yml b/_bookdown.yml index 52c4308..a204909 100755 --- a/_bookdown.yml +++ b/_bookdown.yml @@ -10,7 +10,7 @@ language: chapter_name: "Chapter " new_session: yes output_dir: "docs" -#rmd_files: ["index.Rmd", "01_introduction.Rmd"] +#rmd_files: ["index.Rmd", "01_introduction.Rmd", "16_appendix.Rmd"] #rmd_files: ["index.Rmd", "02_basics.Rmd"] #rmd_files: ["index.Rmd", "03_summarising.Rmd"] #rmd_files: ["index.Rmd", "04_plotting.Rmd"] diff --git a/book.bib b/book.bib index b4ab9f1..145a76f 100755 --- a/book.bib +++ b/book.bib @@ -2,12 +2,19 @@ @Book{xie2015 title = {Dynamic Documents with {R} and knitr}, author = {Yihui Xie}, publisher = {Chapman and Hall/CRC}, - address = {Boca Raton, Florida}, year = {2015}, edition = {2nd}, note = {ISBN 978-1498716963}, url = {http://yihui.name/knitr/}, } +@Book{xie2016, + title = {bookdown: Authoring Books and Technical Documents with R Markdown}, + author = {Yihui Xie}, + publisher = {Chapman and Hall/CRC}, + year = {2016}, + note = {ISBN 978-1138700109}, + url = {https://bookdown.org/yihui/bookdown/}, +} @Manual{bryan2017, title = {gapminder: Data from Gapminder}, author = {Jennifer Bryan}, @@ -46,4 +53,35 @@ @article{brewer2003 publisher = {Taylor & Francis}, doi = {10.1179/000870403235002042} } +@Book{therneau2000, + title = {Modeling Survival Data: Extending the {C}ox Model}, + author = {{Terry M. Therneau} and {Patricia M. 
Grambsch}}, + year = {2000}, + publisher = {Springer}, + address = {New York}, + isbn = {0-387-98784-3}, + } +@Manual{therneau2020, + title = {A Package for Survival Analysis in R}, + author = {Terry M Therneau}, + year = {2020}, + note = {R package version 3.2-3}, + url = {https://CRAN.R-project.org/package=survival}, + } + +@book{harrell2015, + edition = {2}, + series = {Springer {Series} in {Statistics}}, + title = {Regression {Modeling} {Strategies}: {With} {Applications} to {Linear} {Models}, {Logistic} and {Ordinal} {Regression}, and {Survival} {Analysis}}, + isbn = {978-3-319-19424-0}, + shorttitle = {Regression {Modeling} {Strategies}}, + url = {https://www.springer.com/gp/book/9783319194240}, + abstract = {This highly anticipated second edition features new chapters and sections, 225 new references, and comprehensive R software. In keeping with the previous edition, this book is about the art and science of data analysis and predictive modelling, which entails choosing and using multiple tools. Instead of presenting isolated techniques, this text emphasises problem solving strategies that address the many issues arising when developing multi-variable models using real data and not standard textbook examples. Regression Modelling Strategies presents full-scale case studies of non-trivial data-sets instead of over-simplified illustrations of each method. These case studies use freely available R functions that make the multiple imputation, model building, validation and interpretation tasks described in the book relatively easy to do. Most of the methods in this text apply to all regression models, but special emphasis is given to multiple regression using generalised least squares for longitudinal data, the binary logistic model, models for ordinal responses, parametric survival regression models and the Cox semi parametric survival model. 
A new emphasis is given to the robust analysis of continuous dependent variables using ordinal regression. As in the first edition, this text is intended for Masters' or PhD. level graduate students who have had a general introductory probability and statistics course and who are well versed in ordinary multiple regression and intermediate algebra. The book will also serve as a reference for data analysts and statistical methodologists, as it contains an up-to-date survey and bibliography of modern statistical modelling techniques.}, + language = {en}, + urldate = {2020-08-14}, + publisher = {Springer International Publishing}, + author = {Harrell, Frank}, + year = {2015}, + doi = {10.1007/978-3-319-19425-7}, +} diff --git a/docs/healthyr-book.pdf b/docs/healthyr-book.pdf index f082d28..7a20b47 100644 Binary files a/docs/healthyr-book.pdf and b/docs/healthyr-book.pdf differ diff --git a/index.Rmd b/index.Rmd index 2304914..f8eb0ec 100755 --- a/index.Rmd +++ b/index.Rmd @@ -68,7 +68,7 @@ Nowhere more so than in the delivery of healthcare. From the understanding of disease and the development of new treatments, to the diagnosis and management of individual patients, the use of data and technology is now an integral part of the business of healthcare. Those working in healthcare interact daily with data, often without realising it. -The conversion of this avalanche of information to useful knowledge is essential for high quality patient care. +The conversion of this avalanche of information to useful knowledge is essential for high-quality patient care. An important part of this information revolution is the opportunity for everybody to become involved in data analysis. This democratisation is driven in part by the open source software movement – no longer do we require expensive specialised software to do this.
@@ -84,8 +84,6 @@ We are grateful to the many individuals and students who have helped refine this Ewen Harrison and Riinu Pius -March 2020 - diff --git a/library_calls.txt b/library_calls.txt new file mode 100644 index 0000000..6a0d49c --- /dev/null +++ b/library_calls.txt @@ -0,0 +1,151 @@ +01_introduction.Rmd:library(tidyverse) +02_basics.Rmd:library(tidyverse) +02_basics.Rmd:library(kableExtra) +02_basics.Rmd:library(tidyverse) +02_basics.Rmd:library(tidyverse) +02_basics.Rmd:library(patchwork) +02_basics.Rmd:library(tidyverse) +02_basics.Rmd:library(tidyverse) +02_basics.Rmd:library(tidyverse) +02_basics.Rmd:library(lubridate) # lubridate makes working with dates easier +02_basics.Rmd:library(tidyverse) +02_basics.Rmd:library(tidyverse) +02_basics.Rmd:library(lubridate) +02_basics.Rmd:library(lubridate) +02_basics.Rmd:library(tidyverse) +03_summarising.Rmd:library(kableExtra) +03_summarising.Rmd:library(tidyverse) +03_summarising.Rmd:library(scales) +03_summarising.Rmd:library(scales) +03_summarising.Rmd:library(scales) +04_plotting.Rmd:library(tidyverse) +04_plotting.Rmd:library(gapminder) +04_plotting.Rmd:library(kableExtra) +04_plotting.Rmd:library(tidyverse) +04_plotting.Rmd:library(gapminder) +04_plotting.Rmd:library(patchwork) +04_plotting.Rmd:library(tidyverse) +04_plotting.Rmd:library(gapminder) +04_plotting.Rmd:library(tidyverse) +04_plotting.Rmd:library(gapminder) +04_plotting.Rmd:library(tidyverse) +04_plotting.Rmd:library(tidyverse) +04_plotting.Rmd:library(gapminder) +04_plotting.Rmd:library(tidyverse) +04_plotting.Rmd:library(gapminder) +05_fine_tuning_plots.Rmd:library(gapminder) +05_fine_tuning_plots.Rmd:library(tidyverse) +05_fine_tuning_plots.Rmd:library(patchwork) +06_working_continuous.Rmd:library(ggplot2) +06_working_continuous.Rmd:library(kableExtra) +06_working_continuous.Rmd:library(tidyverse) +06_working_continuous.Rmd:library(finalfit) +06_working_continuous.Rmd:library(gapminder) +06_working_continuous.Rmd:library(knitr) 
+06_working_continuous.Rmd:library(kableExtra) +06_working_continuous.Rmd:library(broom) +06_working_continuous.Rmd:library(broom) +06_working_continuous.Rmd:library(ggfortify) +06_working_continuous.Rmd:library(kableExtra) +06_working_continuous.Rmd:library(patchwork) # great for combining plots +06_working_continuous.Rmd:library(broom) +06_working_continuous.Rmd:library(patchwork) +06_working_continuous.Rmd:library(patchwork) +07_linear_regression.Rmd:library(knitr) +07_linear_regression.Rmd:library(kableExtra) +07_linear_regression.Rmd:library(ggplot2) +07_linear_regression.Rmd:library(tidyverse) +07_linear_regression.Rmd:library(gapminder) # dataset +07_linear_regression.Rmd:library(finalfit) +07_linear_regression.Rmd:library(broom) +07_linear_regression.Rmd:library(ggfortify) +07_linear_regression.Rmd:library(dplyr) +07_linear_regression.Rmd:library(finalfit) +07_linear_regression.Rmd:library(knitr) +08_working_categorical.Rmd:library(knitr) +08_working_categorical.Rmd:library(kableExtra) +08_working_categorical.Rmd:library(tidyverse) +08_working_categorical.Rmd:library(finalfit) +08_working_categorical.Rmd:library(patchwork) +08_working_categorical.Rmd:library(patchwork) +08_working_categorical.Rmd:library(finalfit) +08_working_categorical.Rmd:library(finalfit) +08_working_categorical.Rmd:library(finalfit) +08_working_categorical.Rmd:library(finalfit) +08_working_categorical.Rmd:library(magrittr) +08_working_categorical.Rmd:library(broom) +08_working_categorical.Rmd:library(finalfit) +08_working_categorical.Rmd:library(finalfit) +09_logistic_regression.Rmd:library(knitr) +09_logistic_regression.Rmd:library(kableExtra) +09_logistic_regression.Rmd:library(ggplot2) +09_logistic_regression.Rmd:library(tidyverse) +09_logistic_regression.Rmd:library(finalfit) +09_logistic_regression.Rmd:library(tidyverse) +09_logistic_regression.Rmd:library(finalfit) +09_logistic_regression.Rmd:library(ggplot2) +09_logistic_regression.Rmd:library(patchwork) 
+09_logistic_regression.Rmd:library(finalfit) +09_logistic_regression.Rmd:library(finalfit) +09_logistic_regression.Rmd:library(GGally) +09_logistic_regression.Rmd:library(broom) +09_logistic_regression.Rmd:library(finalfit) +09_logistic_regression.Rmd:library(finalfit) +09_logistic_regression.Rmd:library(finalfit) +09_logistic_regression.Rmd:library(finalfit) +09_logistic_regression.Rmd:library(finalfit) +09_logistic_regression.Rmd:library(finalfit) +09_logistic_regression.Rmd:library(finalfit) +09_logistic_regression.Rmd:library(finalfit) +09_logistic_regression.Rmd:library(finalfit) +09_logistic_regression.Rmd:library(finalfit) +09_logistic_regression.Rmd:library(finalfit) +09_logistic_regression.Rmd:library(finalfit) +09_logistic_regression.Rmd:library(finalfit) +09_logistic_regression.Rmd:library(finalfit) +09_logistic_regression.Rmd:library(lme4) +09_logistic_regression.Rmd:library(finalfit) +09_logistic_regression.Rmd:library(finalfit) +10_survival.Rmd:library(knitr) +10_survival.Rmd:library(kableExtra) +10_survival.Rmd:library(tidyverse) +10_survival.Rmd:library(finalfit) +10_survival.Rmd:library(dplyr) +10_survival.Rmd:library(forcats) +10_survival.Rmd:library(survival) +10_survival.Rmd:library(survival) +10_survival.Rmd:library(ggplot2) +10_survival.Rmd:library(lubridate) +10_survival.Rmd:library(survminer) +11_missing_data.Rmd:library(knitr) +11_missing_data.Rmd:library(kableExtra) +11_missing_data.Rmd:library(finalfit) +11_missing_data.Rmd:library(dplyr) +11_missing_data.Rmd:library(MissMech) +11_missing_data.Rmd:library(finalfit) +11_missing_data.Rmd:library(dplyr) +11_missing_data.Rmd:library(mice) +11_missing_data.Rmd:library(dplyr) +12_notebooks.Rmd:library(dplyr) +12_notebooks.Rmd:library(readr) +12_notebooks.Rmd:library(tidyverse) +13_exporting.Rmd:library(knitr) +13_exporting.Rmd:library(kableExtra) +13_exporting.Rmd:library(tidyverse) +13_exporting.Rmd:library(finalfit) +13_exporting.Rmd:library(tidyverse) +13_exporting.Rmd:library(finalfit) 
+13_exporting.Rmd:library(finalfit) +13_exporting.Rmd:library(dplyr) +13_exporting.Rmd:library(knitr) +13_exporting.Rmd:library(finalfit) +13_exporting.Rmd:library(dplyr) +13_exporting.Rmd:library(knitr) +13_exporting.Rmd:library(finalfit) +13_exporting.Rmd:library(dplyr) +13_exporting.Rmd:library(knitr) +13_exporting.Rmd:library(kableExtra) +15_encryption.Rmd:library(encryptr) +15_encryption.Rmd:library(dplyr) +16_appendix.Rmd:library(tidyverse) +16_appendix.Rmd:library(knitr)