maditr: Fast Data Aggregation, Modification, and Filtering

Links

Installation

maditr is on CRAN, so for installation you can print in the console install.packages("maditr").

Overview

Package provides pipe-style interface for data.table package. It preserves all data.table features without significant impact on performance. let and take functions are simplified interfaces for most common data manipulation tasks.

To select rows from data: rows(mtcars, am==0)
To select columns from data: columns(mtcars, mpg, vs:carb)
To aggregate data: take(mtcars, mean_mpg = mean(mpg), by = am)
To aggregate all non-grouping columns: take_all(mtcars, mean, by = am)
To aggregate several columns with one summary: take(mtcars, mpg, hp, fun = mean, by = am)
To get total summary skip by argument: take_all(mtcars, mean)
Use magrittr pipe %>% to chain several operations:

     mtcars %>%
        let(mpg_hp = mpg/hp) %>%
        take(mean(mpg_hp), by = am)

To modify variables or add new variables:

      mtcars %>%
         let(new_var = 42,
             new_var2 = new_var*hp) %>%
         head()

To drop variable assign NULL: let(mtcars, am = NULL) %>% head()
To modify all non-grouping variables:

    iris %>%
      let_all(
          scaled = (.x - mean(.x))/sd(.x),
          by = Species) %>%
       head()

To aggregate all variables conditionally on name:

    iris %>%
      take_all(
          mean = if(startsWith(.name, "Sepal")) mean(.x),
          median = if(startsWith(.name, "Petal")) median(.x),
          by = Species
      )

For parametric assignment use :=:

    new_var = "my_var"
    old_var = "mpg"
    mtcars %>%
        let((new_var) := get(old_var)*2) %>%
        head()
     
    # or,  
    expr = quote(mean(cyl))
    mtcars %>% 
        let((new_var) := eval(expr)) %>% 
        head()
    
    # the same with `take` 
    by_var = "vs,am"
    take(mtcars, (new_var) := eval(expr), by = by_var)

query_if function translates its arguments one-to-one to [.data.table method. Additionally there are some conveniences such as automatic data.frame conversion to data.table.

vlookup & xlookup

Let's make datasets for lookups:

library(maditr)


workers = fread("
    name company
    Nick Acme
    John Ajax
    Daniela Ajax
")

positions = fread("
    name position
    John designer
    Daniela engineer
    Cathie manager
")

# xlookup
workers = let(workers,
  position = xlookup(name, positions$name, positions$position)
)

# vlookup
# by default we search in the first column and return values from second column
workers = let(workers,
  position = vlookup(name, positions, no_match = "Not found")
)

# the same 
workers = let(workers,
  position = vlookup(name, positions, 
                     result_column = "position", 
                     no_match = "Not found") # or, result_column = 2 
)

head(workers)

More examples

We will use for demonstartion well-known mtcars dataset and some examples from dplyr package.

library(maditr)

data(mtcars)

# Newly created variables are available immediately
mtcars %>%
    let(
        cyl2 = cyl * 2,
        cyl4 = cyl2 * 2
    ) %>% head()

# You can also use let() to remove variables and
# modify existing variables
mtcars %>%
    let(
        mpg = NULL,
        disp = disp * 0.0163871 # convert to litres
    ) %>% head()


# window functions are useful for grouped computations
mtcars %>%
    let(rank = rank(-mpg, ties.method = "min"),
        by = cyl) %>%
    head()

# You can drop variables by setting them to NULL
mtcars %>%
    let(cyl = NULL) %>%
    head()

# keeps all existing variables
mtcars %>%
    let(displ_l = disp / 61.0237) %>%
    head()

# keeps only the variables you create
mtcars %>%
    take(displ_l = disp / 61.0237) %>% 
    head()


# can refer to both contextual variables and variable names:
var = 100
mtcars %>%
    let(cyl = cyl * var) %>%
    head()

# select rows
mtcars %>%
    rows(am==0) %>% 
    head()

# select rows with compound condition
mtcars %>%
    rows(am==0 & mpg>mean(mpg))

# select columns
mtcars %>% 
    columns(vs:carb, cyl)
    
mtcars %>% 
    columns(-am, -cyl)    

# regular expression pattern
columns(iris, "^Petal") # variables which start from 'Petal'
columns(iris, "Width$") # variables which end with 'Width'

# move Species variable to the front
# pattern "^." matches all variables
columns(iris, Species, "^.")

# pattern "^.*al" means "contains 'al'"
columns(iris, "^.*al")

# numeric indexing - all variables except Species
columns(iris, 1:4) 

# A 'take' with summary functions applied without 'by' argument returns an aggregated data
mtcars %>%
    take(mean = mean(disp), n = .N)

# Usually, you'll want to group first
mtcars %>%
    take(mean = mean(disp), n = .N, by = am)

# grouping by multiple variables
mtcars %>%
    take(mean = mean(disp), n = .N, by = list(am, vs))

# You can group by expressions:
mtcars %>%
    take_all(
        mean,
        by = list(vsam = vs + am)
    )

# modify all non-grouping variables in-place
mtcars %>%
    let_all((.x - mean(.x))/sd(.x), by = am) %>%
    head()

# modify all non-grouping variables to new variables
mtcars %>%
    let_all(scaled = (.x - mean(.x))/sd(.x), by = am) %>%
    head()

# conditionally modify all variables
iris %>%
    let_all(mean = if(is.numeric(.x)) mean(.x)) %>%
    head()

# modify all variables conditionally on name
iris %>%
    let_all(
        mean = if(startsWith(.name, "Sepal")) mean(.x),
        median = if(startsWith(.name, "Petal")) median(.x),
        by = Species
    ) %>%
    head()

# aggregation with 'take_all'
mtcars %>%
    take_all(mean = mean(.x), sd = sd(.x), n = .N, by = am)

# conditionally aggregate all variables
iris %>%
    take_all(mean = if(is.numeric(.x)) mean(.x))

# aggregate all variables conditionally on name
iris %>%
    take_all(
        mean = if(startsWith(.name, "Sepal")) mean(.x),
        median = if(startsWith(.name, "Petal")) median(.x),
        by = Species
    )

# parametric evaluation:
var = quote(mean(cyl))
mtcars %>% 
    let(mean_cyl = eval(var)) %>% 
    head()
take(mtcars, eval(var))

# all together
new_var = "mean_cyl"
mtcars %>% 
    let((new_var) := eval(var)) %>% 
    head()
take(mtcars, (new_var) := eval(var))

Variable selection in the expressions

You can use 'columns' inside expression in the 'take'/'let'. 'columns' will be replaced with data.table with selected columns. In 'let' in the expressions with ':=', 'cols' or '%to%' can be placed in the left part of the expression. It is usefull for multiple assignment. There are four ways of column selection:

Simply by column names
By variable ranges, e. g. vs:carb. Alternatively, you can use '%to%' instead of colon: 'vs %to% carb'.
With regular expressions. Characters which start with '^' or end with $ considered as Perl-style regular expression patterns. For example, '^Petal' returns all variables started with 'Petal'. 'Width$' returns all variables which end with 'Width'. Pattern '^.' matches all variables and pattern '^.*my_str' is equivalent to contains "my_str"'.
By character variables with interpolated parts. Expression in the curly brackets inside characters will be evaluated in the parent frame with 'text_expand' function. For example, a{1:3} will be transformed to the names 'a1', 'a2', 'a3'. 'cols' is just a shortcut for 'columns'.

# range selection
iris %>% 
    let(
        avg = rowMeans(Sepal.Length %to% Petal.Width)
    ) %>% 
    head()

# multiassignment
iris %>% 
    let(
        # starts with Sepal or Petal
        multipled1 %to% multipled4 := cols("^(Sepal|Petal)")*2
    ) %>% 
    head()


mtcars %>% 
    let(
        # text expansion
        cols("scaled_{names(mtcars)}") := lapply(cols("{names(mtcars)}"), scale)
    ) %>% 
    head()

# range selection in 'by'
# selection of range + additional column
mtcars %>% 
    take(
        res = sum(cols(mpg, disp %to% drat)),
        by = vs %to% gear
    )

Joins

Here we use the same datasets as with lookups:

workers = fread("
    name company
    Nick Acme
    John Ajax
    Daniela Ajax
")

positions = fread("
    name position
    John designer
    Daniela engineer
    Cathie manager
")

workers
positions

Different kinds of joins:

workers %>% dt_inner_join(positions)
workers %>% dt_left_join(positions)
workers %>% dt_right_join(positions)
workers %>% dt_full_join(positions)

# filtering joins
workers %>% dt_anti_join(positions)
workers %>% dt_semi_join(positions)

To suppress the message, supply by argument:

workers %>% dt_left_join(positions, by = "name")

Use a named by if the join variables have different names:

positions2 = setNames(positions, c("worker", "position")) # rename first column in 'positions'
workers %>% dt_inner_join(positions2, by = c("name" = "worker"))

'dplyr'-like interface for data.table.

There are a small subset of 'dplyr' verbs to work with data.table. Note that there is no group_by verb - use by or keyby argument when needed.

dt_mutate adds new variables or modify existing variables. If data is data.table then it modifies in-place.
dt_summarize computes summary statistics. Splits the data into subsets, computes summary statistics for each, and returns the result in the "data.table" form.
dt_summarize_all the same as dt_summarize but work over all non-grouping variables.
dt_filter Selects rows/cases where conditions are true. Rows where the condition evaluates to NA are dropped.
dt_select Selects column/variables from the data set. Range of variables are supported, e. g. vs:carb. Characters which start with ^ or end with \$ considered as Perl-style regular expression patterns. For example, '^Petal' returns all variables started with 'Petal'. 'Width\$' returns all variables which end with 'Width'. Pattern ^. matches all variables and pattern '^.*my_str' is equivalent to contains "my_str". See examples.

# examples from 'dplyr'
# newly created variables are available immediately
mtcars  %>%
    dt_mutate(
        cyl2 = cyl * 2,
        cyl4 = cyl2 * 2
    ) %>%
    head()


# you can also use dt_mutate() to remove variables and
# modify existing variables
mtcars %>%
    dt_mutate(
        mpg = NULL,
        disp = disp * 0.0163871 # convert to litres
    ) %>%
    head()


# window functions are useful for grouped mutates
mtcars %>%
    dt_mutate(
        rank = rank(-mpg, ties.method = "min"),
        keyby = cyl) %>%
    print()


# You can drop variables by setting them to NULL
mtcars %>% dt_mutate(cyl = NULL) %>% head()

# A summary applied without by returns a single row
mtcars %>%
    dt_summarise(mean = mean(disp), n = .N)

# Usually, you'll want to group first
mtcars %>%
    dt_summarise(mean = mean(disp), n = .N, by = cyl)


# Multiple 'by' - variables
mtcars %>%
    dt_summarise(cyl_n = .N, by = list(cyl, vs))

# Newly created summaries immediately
# doesn't overwrite existing variables
mtcars %>%
    dt_summarise(disp = mean(disp),
                  sd = sd(disp),
                  by = cyl)

# You can group by expressions:
mtcars %>%
    dt_summarise_all(mean, by = list(vsam = vs + am))

# filter by condition
mtcars %>%
    dt_filter(am==0)

# filter by compound condition
mtcars %>%
    dt_filter(am==0,  mpg>mean(mpg))


# select
mtcars %>% dt_select(vs:carb, cyl)
mtcars %>% dt_select(-am, -cyl)

# regular expression pattern
dt_select(iris, "^Petal") # variables which start from 'Petal'
dt_select(iris, "Width$") # variables which end with 'Width'
# move Species variable to the front
# pattern "^." matches all variables
dt_select(iris, Species, "^.")
# pattern "^.*al" means "contains 'al'"
dt_select(iris, "^.*al")
dt_select(iris, 1:4) # numeric indexing - all variables except Species

Name		Name	Last commit message	Last commit date
Latest commit History 208 Commits
R		R
inst/tinytest		inst/tinytest
man		man
tests		tests
vignettes		vignettes
.Rbuildignore		.Rbuildignore
.gitignore		.gitignore
.travis.yml		.travis.yml
DESCRIPTION		DESCRIPTION
NAMESPACE		NAMESPACE
NEWS		NEWS
README.MD		README.MD
codecov.yml		codecov.yml
maditr.Rproj		maditr.Rproj

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

maditr: Fast Data Aggregation, Modification, and Filtering

Links

Installation

Overview

vlookup & xlookup

More examples

Variable selection in the expressions

Joins

'dplyr'-like interface for data.table.

About

Releases 13

Packages

Contributors 3

Languages

gdemin/maditr

Folders and files

Latest commit

History

Repository files navigation

maditr: Fast Data Aggregation, Modification, and Filtering

Links

Installation

Overview

vlookup & xlookup

More examples

Variable selection in the expressions

Joins

'dplyr'-like interface for data.table.

About

Topics

Resources

Stars

Watchers

Forks

Releases 13

Packages 0

Contributors 3

Languages

Packages