Mutate Partitioner

John Mount, Win-Vector LLC

2021-09-01

seplyr::partition_mutate_qt() is a service supplied by the package seplyr (version 0.5.0 or higher).

seplyr::partition_mutate_qt() can partition a sequence of assignments so that no statement uses any value created in the same partition element or group. Each group of the partition is in a format accepted by seplyr::mutate_se(), and the whole list of groups can be executed in order with seplyr::mutate_seb() (as shown below).

For such a partition the evaluation result does not depend on the order of execution of the statements within each group (as they are all independent of each other’s left-hand sides). A dependency-free partition with a small number of groups is very helpful when executing expressions on SQL-based data interfaces (such as Apache Spark).

The method used to partition expressions is to scan the remaining expressions in order, taking any that: (1) have all their values available from earlier groups, (2) do not use a value formed in the current group, and (3) do not overwrite a value formed in the current group.

This partitioning method can lead to far fewer groups than the straightforward method of breaking up the sequence of expressions at each new-value use.

Here is a non-trivial example (notice we use := for assignment, that is a requirement of seplyr::mutate_se()):

library("seplyr")
#> Loading required package: wrapr

# Build a partition plan from 20 assignments: five families (a..e), each a
# chain rand_* -> choice_* -> (*_1, *_2). The right-hand sides are captured
# as text rather than evaluated here (see the printed plan, where each one
# appears as a string). The numbered trailing comments mark every place a
# statement first consumes a value created earlier in this same sequence.
plan <- partition_mutate_qt(
  # family a
  rand_a := rand(),
   choice_a := rand_a>=0.5, # first use of a new value 1
    a_1 := ifelse(choice_a, # first use of a new value 2
                  'treatment', 
                  'control'),
    a_2 := ifelse(choice_a, 
                  'control', 
                  'treatment'),
  # family b
  rand_b := rand(),
   choice_b := rand_b>=0.5, # first use of a new value 3
    b_1 := ifelse(choice_b, # first use of a new value 4
                  'treatment', 
                  'control'),
    b_2 := ifelse(choice_b, 
                  'control', 
                  'treatment'),
  # family c
  rand_c := rand(),
   choice_c := rand_c>=0.5, # first use of a new value 5
    c_1 := ifelse(choice_c, # first use of a new value 6
                  'treatment', 
                  'control'),
    c_2 := ifelse(choice_c, 
                  'control', 
                  'treatment'),
  # family d
  rand_d := rand(),
   choice_d := rand_d>=0.5, # first use of a new value 7
    d_1 := ifelse(choice_d, # first use of a new value 8
                  'treatment', 
                  'control'),
    d_2 := ifelse(choice_d, 
                  'control', 
                  'treatment'),
  # family e
  rand_e := rand(),
   choice_e := rand_e>=0.5, # first use of a new value 9
    e_1 := ifelse(choice_e, # first use of a new value 10
                  'treatment', 
                  'control'),
    e_2 := ifelse(choice_e, 
                  'control', 
                  'treatment')
  )

# Show the groups the planner produced (one named list element per group).
print(plan)
#> $group00001
#>   rand_a   rand_b   rand_c   rand_d   rand_e 
#> "rand()" "rand()" "rand()" "rand()" "rand()" 
#> 
#> $group00002
#>        choice_a        choice_b        choice_c        choice_d        choice_e 
#> "rand_a >= 0.5" "rand_b >= 0.5" "rand_c >= 0.5" "rand_d >= 0.5" "rand_e >= 0.5" 
#> 
#> $group00003
#>                                            a_1 
#> "ifelse(choice_a, \"treatment\", \"control\")" 
#>                                            a_2 
#> "ifelse(choice_a, \"control\", \"treatment\")" 
#>                                            b_1 
#> "ifelse(choice_b, \"treatment\", \"control\")" 
#>                                            b_2 
#> "ifelse(choice_b, \"control\", \"treatment\")" 
#>                                            c_1 
#> "ifelse(choice_c, \"treatment\", \"control\")" 
#>                                            c_2 
#> "ifelse(choice_c, \"control\", \"treatment\")" 
#>                                            d_1 
#> "ifelse(choice_d, \"treatment\", \"control\")" 
#>                                            d_2 
#> "ifelse(choice_d, \"control\", \"treatment\")" 
#>                                            e_1 
#> "ifelse(choice_e, \"treatment\", \"control\")" 
#>                                            e_2 
#> "ifelse(choice_e, \"control\", \"treatment\")"

Notice seplyr::partition_mutate_qt() split the work into 3 groups. The straightforward method (with no statement re-ordering) of splitting into non-dependent groups would have to split the mutate at each first use of a new value, yielding 10 splits and therefore 11 mutate stages. For why a low number of execution stages is important please see here.

To execute the statements on a data-item “d” (either an in-memory data.frame or a remote database or Sparklyr handle) we would do something like the following:

# Illustrative only: `d` is not defined in this document; it stands for the
# data.frame or remote handle described in the preceding paragraph.
# The whole plan (the list of groups) is passed in a single call.
res <- mutate_seb(d, plan)

A fully worked version of this example can be found here.

Note that re-using variable names does limit the planner’s ability to efficiently partition the statements. The planner still emits safe and correct code, but unless it were allowed to introduce new variable names it must break the sequence in more places (note: prior to seplyr version 0.5.2 some dependencies were missed, leading to possibly unsafe code). We show this effect below.

# Same shape of workload as a plan with per-family names, except every
# family writes to the shared names `rand` and `choice`. Each later
# `rand :=` / `choice :=` therefore overwrites a value that earlier
# statements in the sequence read, and `plan` itself is re-assigned here.
plan <- partition_mutate_qt(
  # family a
  rand := rand(),
   choice := rand>=0.5, # first use of a new value 1
    a_1 := ifelse(choice, # first use of a new value 2
                  'treatment', 
                  'control'),
    a_2 := ifelse(choice, 
                  'control', 
                  'treatment'),
  # family b (overwrites rand and choice)
  rand := rand(),
   choice := rand>=0.5, # first use of a new value 3
    b_1 := ifelse(choice, # first use of a new value 4
                  'treatment', 
                  'control'),
    b_2 := ifelse(choice, 
                  'control', 
                  'treatment'),
  # family c (overwrites rand and choice)
  rand := rand(),
   choice := rand>=0.5, # first use of a new value 5
    c_1 := ifelse(choice, # first use of a new value 6
                  'treatment', 
                  'control'),
    c_2 := ifelse(choice, 
                  'control', 
                  'treatment'),
  # family d (overwrites rand and choice)
  rand := rand(),
   choice := rand>=0.5, # first use of a new value 7
    d_1 := ifelse(choice, # first use of a new value 8
                  'treatment', 
                  'control'),
    d_2 := ifelse(choice, 
                  'control', 
                  'treatment'),
  # family e (overwrites rand and choice)
  rand := rand(),
   choice := rand>=0.5, # first use of a new value 9
    e_1 := ifelse(choice, # first use of a new value 10
                  'treatment', 
                  'control'),
    e_2 := ifelse(choice, 
                  'control', 
                  'treatment')
  )

# Show the groups the planner produced for the name-re-using sequence.
print(plan)
#> $group00001
#>     rand 
#> "rand()" 
#> 
#> $group00002
#>        choice 
#> "rand >= 0.5" 
#> 
#> $group00003
#>                                          a_1 
#> "ifelse(choice, \"treatment\", \"control\")" 
#>                                          a_2 
#> "ifelse(choice, \"control\", \"treatment\")" 
#>                                         rand 
#>                                     "rand()" 
#> 
#> $group00004
#>        choice 
#> "rand >= 0.5" 
#> 
#> $group00005
#>                                          b_1 
#> "ifelse(choice, \"treatment\", \"control\")" 
#>                                          b_2 
#> "ifelse(choice, \"control\", \"treatment\")" 
#>                                         rand 
#>                                     "rand()" 
#> 
#> $group00006
#>        choice 
#> "rand >= 0.5" 
#> 
#> $group00007
#>                                          c_1 
#> "ifelse(choice, \"treatment\", \"control\")" 
#>                                          c_2 
#> "ifelse(choice, \"control\", \"treatment\")" 
#>                                         rand 
#>                                     "rand()" 
#> 
#> $group00008
#>        choice 
#> "rand >= 0.5" 
#> 
#> $group00009
#>                                          d_1 
#> "ifelse(choice, \"treatment\", \"control\")" 
#>                                          d_2 
#> "ifelse(choice, \"control\", \"treatment\")" 
#>                                         rand 
#>                                     "rand()" 
#> 
#> $group00010
#>        choice 
#> "rand >= 0.5" 
#> 
#> $group00011
#>                                          e_1 
#> "ifelse(choice, \"treatment\", \"control\")" 
#>                                          e_2 
#> "ifelse(choice, \"control\", \"treatment\")"