# Use data.table the tidy way: An ultimate tutorial of tidyfst

I love the tidy syntax of dplyr and the ultimate speed of data.table. Why not take the both? That is why I have started the work of tidyfst, bridging the tidy syntax and computation performance via translation. This tool is especially friendly for dplyr users who want to learn some data.table, but data.table could also benefit from it (more or less).

A great comparison of data.table and dplyr was displayed at https://atrebas.github.io/post/2019-03-03-datatable-dplyr/ (thanks to Atrebas). I love this tutorial very much because it dig rather deep into many features from both packages. Here I’ll try to implement all operations from that tutorial, and the potential users could find why they would prefer tidyfst for some (if not most) tasks.

The below examples have all been checked with tidyfst. Now let’s begin.

## Create example data

library(tidyfst)

set.seed(1L)

## Create a data table
DF <- data.table(V1 = rep(c(1L, 2L), 5)[-10],
V2 = 1:9,
V3 = c(0.5, 1.0, 1.5),
V4 = rep(LETTERS[1:3], 3))
copy(DF) -> DT

class(DF)
#> [1] "data.table" "data.frame"
DF
#>       V1    V2    V3     V4
#>    <int> <int> <num> <char>
#> 1:     1     1   0.5      A
#> 2:     2     2   1.0      B
#> 3:     1     3   1.5      C
#> 4:     2     4   0.5      A
#> 5:     1     5   1.0      B
#> 6:     2     6   1.5      C
#> 7:     1     7   0.5      A
#> 8:     2     8   1.0      B
#> 9:     1     9   1.5      C

# Basic operations

## Filter rows

### Filter rows using indices
slice_dt(DF, 3:4)
#>       V1    V2    V3     V4
#>    <int> <int> <num> <char>
#> 1:     1     3   1.5      C
#> 2:     2     4   0.5      A

### Discard rows using negative indices
slice_dt(DF, -(3:7))
#>       V1    V2    V3     V4
#>    <int> <int> <num> <char>
#> 1:     1     1   0.5      A
#> 2:     2     2   1.0      B
#> 3:     2     8   1.0      B
#> 4:     1     9   1.5      C

### Filter rows using a logical expression
filter_dt(DF, V2 > 5)
#>       V1    V2    V3     V4
#>    <int> <int> <num> <char>
#> 1:     2     6   1.5      C
#> 2:     1     7   0.5      A
#> 3:     2     8   1.0      B
#> 4:     1     9   1.5      C
filter_dt(DF, V4 %in% c("A", "C"))
#>       V1    V2    V3     V4
#>    <int> <int> <num> <char>
#> 1:     1     1   0.5      A
#> 2:     1     3   1.5      C
#> 3:     2     4   0.5      A
#> 4:     2     6   1.5      C
#> 5:     1     7   0.5      A
#> 6:     1     9   1.5      C
filter_dt(DF, V4 %chin% c("A", "C")) # fast %in% for character
#>       V1    V2    V3     V4
#>    <int> <int> <num> <char>
#> 1:     1     1   0.5      A
#> 2:     1     3   1.5      C
#> 3:     2     4   0.5      A
#> 4:     2     6   1.5      C
#> 5:     1     7   0.5      A
#> 6:     1     9   1.5      C

### Filter rows using multiple conditions
filter_dt(DF, V1 == 1, V4 == "A")
#> [1]  TRUE FALSE FALSE  TRUE FALSE
# equals to
filter_dt(DF, V1 == 1 & V4 == "A")
#>       V1    V2    V3     V4
#>    <int> <int> <num> <char>
#> 1:     1     1   0.5      A
#> 2:     1     7   0.5      A

### Filter unique rows
distinct_dt(DF) # unique(DF)
#> Indices: <V4>, <V1>, <V4__V1>
#>       V1    V2    V3     V4
#>    <int> <int> <num> <char>
#> 1:     1     1   0.5      A
#> 2:     2     2   1.0      B
#> 3:     1     3   1.5      C
#> 4:     2     4   0.5      A
#> 5:     1     5   1.0      B
#> 6:     2     6   1.5      C
#> 7:     1     7   0.5      A
#> 8:     2     8   1.0      B
#> 9:     1     9   1.5      C
distinct_dt(DF, V1,V4)
#>       V1     V4
#>    <int> <char>
#> 1:     1      A
#> 2:     2      B
#> 3:     1      C
#> 4:     2      A
#> 5:     1      B
#> 6:     2      C

### Discard rows with missing values
drop_na_dt(DF) # na.omit(DF)
#> Indices: <V4>, <V1>, <V4__V1>
#>       V1    V2    V3     V4
#>    <int> <int> <num> <char>
#> 1:     1     1   0.5      A
#> 2:     2     2   1.0      B
#> 3:     1     3   1.5      C
#> 4:     2     4   0.5      A
#> 5:     1     5   1.0      B
#> 6:     2     6   1.5      C
#> 7:     1     7   0.5      A
#> 8:     2     8   1.0      B
#> 9:     1     9   1.5      C

### Other filters
sample_n_dt(DF, 3)      # n random rows
#>       V1    V2    V3     V4
#>    <int> <int> <num> <char>
#> 1:     1     9   1.5      C
#> 2:     2     4   0.5      A
#> 3:     1     7   0.5      A
sample_frac_dt(DF, 0.5) # fraction of random rows
#>       V1    V2    V3     V4
#>    <int> <int> <num> <char>
#> 1:     1     1   0.5      A
#> 2:     2     2   1.0      B
#> 3:     1     5   1.0      B
#> 4:     1     3   1.5      C
slice_max_dt(DF, V1,1)     # top n entries (includes equals)
#>       V1    V2    V3     V4
#>    <int> <int> <num> <char>
#> 1:     2     2   1.0      B
#> 2:     2     4   0.5      A
#> 3:     2     6   1.5      C
#> 4:     2     8   1.0      B

filter_dt(DT,V4 %like% "^B")
#>       V1    V2    V3     V4
#>    <int> <int> <num> <char>
#> 1:     2     2     1      B
#> 2:     1     5     1      B
#> 3:     2     8     1      B
filter_dt(DT,V2 %between% c(3, 5))
#>       V1    V2    V3     V4
#>    <int> <int> <num> <char>
#> 1:     1     3   1.5      C
#> 2:     2     4   0.5      A
#> 3:     1     5   1.0      B
filter_dt(DT,between(V2, 3, 5, incbounds = FALSE))
#>       V1    V2    V3     V4
#>    <int> <int> <num> <char>
#> 1:     2     4   0.5      A
#>       V1    V2    V3     V4
#>    <int> <int> <num> <char>
#> 1:     1     1   0.5      A
#> 2:     2     2   1.0      B
#> 3:     1     3   1.5      C

## Sort rows

### Sort rows by column
arrange_dt(DF, V3)
#>       V1    V2    V3     V4
#>    <int> <int> <num> <char>
#> 1:     1     1   0.5      A
#> 2:     2     4   0.5      A
#> 3:     1     7   0.5      A
#> 4:     2     2   1.0      B
#> 5:     1     5   1.0      B
#> 6:     2     8   1.0      B
#> 7:     1     3   1.5      C
#> 8:     2     6   1.5      C
#> 9:     1     9   1.5      C

### Sort rows in decreasing order
arrange_dt(DF, -V3)
#>       V1    V2    V3     V4
#>    <int> <int> <num> <char>
#> 1:     1     3   1.5      C
#> 2:     2     6   1.5      C
#> 3:     1     9   1.5      C
#> 4:     2     2   1.0      B
#> 5:     1     5   1.0      B
#> 6:     2     8   1.0      B
#> 7:     1     1   0.5      A
#> 8:     2     4   0.5      A
#> 9:     1     7   0.5      A

### Sort rows based on several columns
arrange_dt(DF, V1, -V2)
#>       V1    V2    V3     V4
#>    <int> <int> <num> <char>
#> 1:     1     9   1.5      C
#> 2:     1     7   0.5      A
#> 3:     1     5   1.0      B
#> 4:     1     3   1.5      C
#> 5:     1     1   0.5      A
#> 6:     2     8   1.0      B
#> 7:     2     6   1.5      C
#> 8:     2     4   0.5      A
#> 9:     2     2   1.0      B

## Select columns

### Select one column using an index (not recommended)
pull_dt(DT,3) # returns a vector
#> [1] 0.5 1.0 1.5 0.5 1.0 1.5 0.5 1.0 1.5
select_dt(DT,3) # returns a data.table
#>       V3
#>    <num>
#> 1:   0.5
#> 2:   1.0
#> 3:   1.5
#> 4:   0.5
#> 5:   1.0
#> 6:   1.5
#> 7:   0.5
#> 8:   1.0
#> 9:   1.5

### Select one column using column name
select_dt(DF, V2) # returns a data.table
#>       V2
#>    <int>
#> 1:     1
#> 2:     2
#> 3:     3
#> 4:     4
#> 5:     5
#> 6:     6
#> 7:     7
#> 8:     8
#> 9:     9
pull_dt(DF, V2)   # returns a vector
#> [1] 1 2 3 4 5 6 7 8 9

### Select several columns
select_dt(DF, V2, V3, V4)
#>       V2    V3     V4
#>    <int> <num> <char>
#> 1:     1   0.5      A
#> 2:     2   1.0      B
#> 3:     3   1.5      C
#> 4:     4   0.5      A
#> 5:     5   1.0      B
#> 6:     6   1.5      C
#> 7:     7   0.5      A
#> 8:     8   1.0      B
#> 9:     9   1.5      C
select_dt(DF, V2:V4) # select columns between V2 and V4
#>       V2    V3     V4
#>    <int> <num> <char>
#> 1:     1   0.5      A
#> 2:     2   1.0      B
#> 3:     3   1.5      C
#> 4:     4   0.5      A
#> 5:     5   1.0      B
#> 6:     6   1.5      C
#> 7:     7   0.5      A
#> 8:     8   1.0      B
#> 9:     9   1.5      C

### Exclude columns
select_dt(DF, -V2, -V3)
#> Indices: <V4>, <V1>, <V4__V1>
#>       V1     V4
#>    <int> <char>
#> 1:     1      A
#> 2:     2      B
#> 3:     1      C
#> 4:     2      A
#> 5:     1      B
#> 6:     2      C
#> 7:     1      A
#> 8:     2      B
#> 9:     1      C

### Select/Exclude columns using a character vector
cols <- c("V2", "V3")
select_dt(DF,cols = cols)
#>       V2    V3
#>    <int> <num>
#> 1:     1   0.5
#> 2:     2   1.0
#> 3:     3   1.5
#> 4:     4   0.5
#> 5:     5   1.0
#> 6:     6   1.5
#> 7:     7   0.5
#> 8:     8   1.0
#> 9:     9   1.5
select_dt(DF,cols = cols,negate = TRUE)
#> Indices: <V4>, <V1>, <V4__V1>
#>       V1     V4
#>    <int> <char>
#> 1:     1      A
#> 2:     2      B
#> 3:     1      C
#> 4:     2      A
#> 5:     1      B
#> 6:     2      C
#> 7:     1      A
#> 8:     2      B
#> 9:     1      C

### Other selections
select_dt(DF, cols = paste0("V", 1:2))
#> Index: <V1>
#>       V1    V2
#>    <int> <int>
#> 1:     1     1
#> 2:     2     2
#> 3:     1     3
#> 4:     2     4
#> 5:     1     5
#> 6:     2     6
#> 7:     1     7
#> 8:     2     8
#> 9:     1     9
relocate_dt(DF, V4) # reorder columns
#> Indices: <V4>, <V1>, <V4__V1>
#>        V4    V1    V2    V3
#>    <char> <int> <int> <num>
#> 1:      A     1     1   0.5
#> 2:      B     2     2   1.0
#> 3:      C     1     3   1.5
#> 4:      A     2     4   0.5
#> 5:      B     1     5   1.0
#> 6:      C     2     6   1.5
#> 7:      A     1     7   0.5
#> 8:      B     2     8   1.0
#> 9:      C     1     9   1.5
select_dt(DF, "V")
#>       V1    V2    V3     V4
#>    <int> <int> <num> <char>
#> 1:     1     1   0.5      A
#> 2:     2     2   1.0      B
#> 3:     1     3   1.5      C
#> 4:     2     4   0.5      A
#> 5:     1     5   1.0      B
#> 6:     2     6   1.5      C
#> 7:     1     7   0.5      A
#> 8:     2     8   1.0      B
#> 9:     1     9   1.5      C
select_dt(DF, "3$") #> V3 #> <num> #> 1: 0.5 #> 2: 1.0 #> 3: 1.5 #> 4: 0.5 #> 5: 1.0 #> 6: 1.5 #> 7: 0.5 #> 8: 1.0 #> 9: 1.5 select_dt(DF, ".2") #> V2 #> <int> #> 1: 1 #> 2: 2 #> 3: 3 #> 4: 4 #> 5: 5 #> 6: 6 #> 7: 7 #> 8: 8 #> 9: 9 select_dt(DF, "V1") #> V1 #> <int> #> 1: 1 #> 2: 2 #> 3: 1 #> 4: 2 #> 5: 1 #> 6: 2 #> 7: 1 #> 8: 2 #> 9: 1 select_dt(DF, -"^V2") #> V1 V3 V4 #> <int> <num> <char> #> 1: 1 0.5 A #> 2: 2 1.0 B #> 3: 1 1.5 C #> 4: 2 0.5 A #> 5: 1 1.0 B #> 6: 2 1.5 C #> 7: 1 0.5 A #> 8: 2 1.0 B #> 9: 1 1.5 C # remove variables using "-" prior to function ## Summarise data ### Summarise one column summarise_dt(DF, sum(V1)) # returns a data.table #> V1 #> <int> #> 1: 13 summarise_dt(DF, sumV1 = sum(V1)) # returns a data.table #> sumV1 #> <int> #> 1: 13 ### Summarise several columns summarise_dt(DF, sum(V1), sd(V3)) #> V1 V2 #> <int> <num> #> 1: 13 0.4330127 ### Summarise several columns and assign column names DF %>% summarise_dt(sumv1 = sum(V1), sdv3 = sd(V3)) #> sumv1 sdv3 #> <int> <num> #> 1: 13 0.4330127 ### Summarise a subset of rows DT[1:4, sum(V1)] #> [1] 6 DF %>% slice_dt(1:4) %>% summarise_dt(sum(V1)) #> V1 #> <int> #> 1: 6 ### Misc summarise_dt(DF, nth(V3,1)) #> V1 #> <num> #> 1: 0.5 summarise_dt(DF, nth(V3,-1)) #> V1 #> <num> #> 1: 1.5 summarise_dt(DF, nth(V3, 5)) #> V1 #> <num> #> 1: 1 summarise_dt(DF, uniqueN(V4)) #> V1 #> <int> #> 1: 3 uniqueN(DF) #> [1] 9 ## group computation (by) ### By group # not recommended DF %>% group_dt( by = V4, summarise_dt(sumV2 = sum(V2)) ) #> V4 sumV2 #> <char> <int> #> 1: A 12 #> 2: B 15 #> 3: C 18 # recommended DF %>% summarise_dt(sumV2 = sum(V2),by = V4) #> V4 sumV2 #> <char> <int> #> 1: A 12 #> 2: B 15 #> 3: C 18 ### By several groups DF %>% summarise_dt(sumV2 = sum(V2),by = .(V1,V4)) #> V1 V4 sumV2 #> <int> <char> <int> #> 1: 1 A 8 #> 2: 2 B 10 #> 3: 1 C 12 #> 4: 2 A 4 #> 5: 1 B 5 #> 6: 2 C 6 ### Calling function in by DF %>% summarise_dt(sumV2 = sum(V2),by = tolower(V4)) #> tolower sumV2 #> <char> <int> #> 1: a 12 #> 2: b 15 #> 3: c 18 ### Assigning column name in by DF %>% summarise_dt(sumV2 = sum(V2),by = .(abc = tolower(V4))) #> abc sumV2 #> <char> <int> #> 1: a 12 #> 2: b 15 #> 3: c 18 ### Using a condition in by DF %>% summarise_dt(sumV2 = sum(V2),by = V4 == "A") #> V4 sumV2 #> <lgcl> <int> #> 1: TRUE 12 #> 2: FALSE 33 ### By on a subset of rows DF %>% slice_dt(1:5) %>% summarise_dt(sumV1 = sum(V1),by = V4) #> V4 sumV1 #> <char> <int> #> 1: A 3 #> 2: B 3 #> 3: C 1 ### Count number of observations for each group count_dt(DF, V4) #> V4 n #> <char> <int> #> 1: A 3 #> 2: B 3 #> 3: C 3 ### Add a column with number of observations for each group add_count_dt(DF, V1) #> Indices: <V4>, <V1>, <V4__V1> #> V1 V2 V3 V4 n #> <int> <int> <num> <char> <int> #> 1: 1 1 0.5 A 5 #> 2: 2 2 1.0 B 4 #> 3: 1 3 1.5 C 5 #> 4: 2 4 0.5 A 4 #> 5: 1 5 1.0 B 5 #> 6: 2 6 1.5 C 4 #> 7: 1 7 0.5 A 5 #> 8: 2 8 1.0 B 4 #> 9: 1 9 1.5 C 5 ### Retrieve the first/last/nth observation for each group DF %>% summarise_dt(by = V4,nth(V2,1)) #> V4 V1 #> <char> <int> #> 1: A 1 #> 2: B 2 #> 3: C 3 DF %>% summarise_dt(by = V4,nth(V2,-1)) #> V4 V1 #> <char> <int> #> 1: A 7 #> 2: B 8 #> 3: C 9 DF %>% summarise_dt(by = V4,nth(V2,2)) #> V4 V1 #> <char> <int> #> 1: A 4 #> 2: B 5 #> 3: C 6 # Going further ## Advanced columns manipulation ### Summarise all the columns summarise_vars(DT,.func = max) #> V1 V2 V3 V4 #> <int> <int> <num> <char> #> 1: 2 9 1.5 C ### Summarise several columns summarise_vars(DT,c("V1", "V2"),mean) #> V1 V2 #> <num> <num> #> 1: 1.444444 5 ### Summarise several columns by group DT %>% summarise_vars(c("V1", "V2"),mean,by = V4) #> V4 V1 V2 #> <char> <num> <num> #> 1: A 1.333333 4 #> 2: B 1.666667 5 #> 3: C 1.333333 6 ## using patterns (regex) DT %>% summarise_vars("V1|V2",mean,by = V4) #> V4 V1 V2 #> <char> <num> <num> #> 1: A 1.333333 4 #> 2: B 1.666667 5 #> 3: C 1.333333 6 ## Summarise with more than one function by group # when you can't find a way, you can always use in_dt to use data.table DT %>% in_dt(, by = V4, c(lapply(.SD, sum), lapply(.SD, mean))) #> V4 V1 V2 V3 V1 V2 V3 #> <char> <int> <int> <num> <num> <num> <num> #> 1: A 4 12 1.5 1.333333 4 0.5 #> 2: B 5 15 3.0 1.666667 5 1.0 #> 3: C 4 18 4.5 1.333333 6 1.5 ### Summarise using a condition summarise_vars(DF, is.numeric, mean) #> V1 V2 V3 #> <num> <num> <num> #> 1: 1.444444 5 1 ### Modify all the columns mutate_vars(DF, .func = rev) #> V1 V2 V3 V4 #> <int> <int> <num> <char> #> 1: 1 9 1.5 C #> 2: 2 8 1.0 B #> 3: 1 7 0.5 A #> 4: 2 6 1.5 C #> 5: 1 5 1.0 B #> 6: 2 4 0.5 A #> 7: 1 3 1.5 C #> 8: 2 2 1.0 B #> 9: 1 1 0.5 A ### Modify several columns (dropping the others) DF %>% select_dt(cols = c("V1", "V2")) %>% mutate_vars(.func = sqrt) #> V1 V2 #> <num> <num> #> 1: 1.000000 1.000000 #> 2: 1.414214 1.414214 #> 3: 1.000000 1.732051 #> 4: 1.414214 2.000000 #> 5: 1.000000 2.236068 #> 6: 1.414214 2.449490 #> 7: 1.000000 2.645751 #> 8: 1.414214 2.828427 #> 9: 1.000000 3.000000 DF %>% select_dt(-V4) %>% mutate_vars(.func = exp) #> V1 V2 V3 #> <num> <num> <num> #> 1: 2.718282 2.718282 1.648721 #> 2: 7.389056 7.389056 2.718282 #> 3: 2.718282 20.085537 4.481689 #> 4: 7.389056 54.598150 1.648721 #> 5: 2.718282 148.413159 2.718282 #> 6: 7.389056 403.428793 4.481689 #> 7: 2.718282 1096.633158 1.648721 #> 8: 7.389056 2980.957987 2.718282 #> 9: 2.718282 8103.083928 4.481689 ### Modify several columns (keeping the others) DF %>% mutate_vars(c("V1", "V2"), sqrt) #> Index: <V4> #> V1 V2 V3 V4 #> <num> <num> <num> <char> #> 1: 1.000000 1.000000 0.5 A #> 2: 1.414214 1.414214 1.0 B #> 3: 1.000000 1.732051 1.5 C #> 4: 1.414214 2.000000 0.5 A #> 5: 1.000000 2.236068 1.0 B #> 6: 1.414214 2.449490 1.5 C #> 7: 1.000000 2.645751 0.5 A #> 8: 1.414214 2.828427 1.0 B #> 9: 1.000000 3.000000 1.5 C DF %>% mutate_vars(-"V4", exp) #> Index: <V4> #> V1 V2 V3 V4 #> <num> <num> <num> <char> #> 1: 2.718282 2.718282 1.648721 A #> 2: 7.389056 7.389056 2.718282 B #> 3: 2.718282 20.085537 4.481689 C #> 4: 7.389056 54.598150 1.648721 A #> 5: 2.718282 148.413159 2.718282 B #> 6: 7.389056 403.428793 4.481689 C #> 7: 2.718282 1096.633158 1.648721 A #> 8: 7.389056 2980.957987 2.718282 B #> 9: 2.718282 8103.083928 4.481689 C ### Modify columns using a condition (dropping the others) select_dt(DT,is.numeric) #> V1 V2 V3 #> <int> <int> <num> #> 1: 1 1 0.5 #> 2: 2 2 1.0 #> 3: 1 3 1.5 #> 4: 2 4 0.5 #> 5: 1 5 1.0 #> 6: 2 6 1.5 #> 7: 1 7 0.5 #> 8: 2 8 1.0 #> 9: 1 9 1.5 ### Modify columns using a condition (keeping the others) mutate_vars(DT,is.numeric,as.integer) #> V1 V2 V3 V4 #> <int> <int> <int> <char> #> 1: 1 1 0 A #> 2: 2 2 1 B #> 3: 1 3 1 C #> 4: 2 4 0 A #> 5: 1 5 1 B #> 6: 2 6 1 C #> 7: 1 7 0 A #> 8: 2 8 1 B #> 9: 1 9 1 C ### Use a complex expression DF %>% group_dt( by = V4, slice_dt(1:2) %>% transmute_dt(V1 = V1, V2 = "X") ) #> V4 V1 V2 #> <char> <int> <char> #> 1: A 1 X #> 2: A 2 X #> 3: B 2 X #> 4: B 1 X #> 5: C 1 X #> 6: C 2 X ### Use multiple expressions (with DT[,{j}]) DT %>% in_dt(,{ print(V1) # comments here! print(summary(V1)) x <- V1 + sum(V2) .(A = 1:.N, B = x) # last list returned as a data.table } ) #> [1] 1 2 1 2 1 2 1 2 1 #> Min. 1st Qu. Median Mean 3rd Qu. Max. #> 1.000 1.000 1.000 1.444 2.000 2.000 #> A B #> <int> <int> #> 1: 1 46 #> 2: 2 47 #> 3: 3 46 #> 4: 4 47 #> 5: 5 46 #> 6: 6 47 #> 7: 7 46 #> 8: 8 47 #> 9: 9 46 ## Advanced use of by ### Select first/last/… row by group DT %>% group_dt( by = V4, head(1) ) #> V4 V1 V2 V3 #> <char> <int> <int> <num> #> 1: A 1 1 0.5 #> 2: B 2 2 1.0 #> 3: C 1 3 1.5 DT %>% group_dt( by = V4, tail(2) ) #> V4 V1 V2 V3 #> <char> <int> <int> <num> #> 1: A 2 4 0.5 #> 2: A 1 7 0.5 #> 3: B 1 5 1.0 #> 4: B 2 8 1.0 #> 5: C 2 6 1.5 #> 6: C 1 9 1.5 DT %>% group_dt( by = V4, slice_dt(1,.N) ) #> V4 V1 V2 V3 #> <char> <int> <int> <num> #> 1: A 1 1 0.5 #> 2: A 1 7 0.5 #> 3: B 2 2 1.0 #> 4: B 2 8 1.0 #> 5: C 1 3 1.5 #> 6: C 1 9 1.5 ### Select rows using a nested query DF %>% group_dt( by = V4, arrange_dt(V2) %>% slice_dt(1) ) #> V4 V1 V2 V3 #> <char> <int> <int> <num> #> 1: A 1 1 0.5 #> 2: B 2 2 1.0 #> 3: C 1 3 1.5 ### Add a group counter column DT %>% mutate_dt(Grp = .GRP,by = .(V4, V1)) #> V1 V2 V3 V4 Grp #> <int> <int> <num> <char> <int> #> 1: 1 1 0.5 A 1 #> 2: 2 2 1.0 B 2 #> 3: 1 3 1.5 C 3 #> 4: 2 4 0.5 A 4 #> 5: 1 5 1.0 B 5 #> 6: 2 6 1.5 C 6 #> 7: 1 7 0.5 A 1 #> 8: 2 8 1.0 B 2 #> 9: 1 9 1.5 C 3 ### Get row number of first (and last) observation by group DT %>% summarise_dt(I = .I,by = V4) #> V4 I #> <char> <int> #> 1: A 1 #> 2: A 4 #> 3: A 7 #> 4: B 2 #> 5: B 5 #> 6: B 8 #> 7: C 3 #> 8: C 6 #> 9: C 9 DT %>% summarise_dt(I = .I[1],by = V4) #> V4 I #> <char> <int> #> 1: A 1 #> 2: B 2 #> 3: C 3 DT %>% summarise_dt(I = .I[c(1,.N)],by = V4) #> V4 I #> <char> <int> #> 1: A 1 #> 2: A 7 #> 3: B 2 #> 4: B 8 #> 5: C 3 #> 6: C 9 ### Handle list-columns by group DT %>% select_dt(V1,V4) %>% chop_dt(V1) # return V1 as a list #> V4 V1 #> <char> <list> #> 1: A 1,2,1 #> 2: B 2,1,2 #> 3: C 1,2,1 DT %>% nest_dt(V4) # subsets of the data #> V4 ndt #> <char> <list> #> 1: A <data.table[3x3]> #> 2: B <data.table[3x3]> #> 3: C <data.table[3x3]> ### Grouping sets (multiple by at once) # use data.table directly, tidyfst does not provide new methods for it yet data.table::rollup(DT, .(SumV2 = sum(V2)), by = c("V1", "V4")) #> V1 V4 SumV2 #> <int> <char> <int> #> 1: 1 A 8 #> 2: 2 B 10 #> 3: 1 C 12 #> 4: 2 A 4 #> 5: 1 B 5 #> 6: 2 C 6 #> 7: 1 <NA> 25 #> 8: 2 <NA> 20 #> 9: NA <NA> 45 data.table::rollup(DT, .(SumV2 = sum(V2), .N), by = c("V1", "V4"), id = TRUE) #> grouping V1 V4 SumV2 N #> <int> <int> <char> <int> <int> #> 1: 0 1 A 8 2 #> 2: 0 2 B 10 2 #> 3: 0 1 C 12 2 #> 4: 0 2 A 4 1 #> 5: 0 1 B 5 1 #> 6: 0 2 C 6 1 #> 7: 1 1 <NA> 25 5 #> 8: 1 2 <NA> 20 4 #> 9: 3 NA <NA> 45 9 data.table::cube(DT, .(SumV2 = sum(V2), .N), by = c("V1", "V4"), id = TRUE) #> grouping V1 V4 SumV2 N #> <int> <int> <char> <int> <int> #> 1: 0 1 A 8 2 #> 2: 0 2 B 10 2 #> 3: 0 1 C 12 2 #> 4: 0 2 A 4 1 #> 5: 0 1 B 5 1 #> 6: 0 2 C 6 1 #> 7: 1 1 <NA> 25 5 #> 8: 1 2 <NA> 20 4 #> 9: 2 NA A 12 3 #> 10: 2 NA B 15 3 #> 11: 2 NA C 18 3 #> 12: 3 NA <NA> 45 9 data.table::groupingsets(DT, .(SumV2 = sum(V2), .N), by = c("V1", "V4"), sets = list("V1", c("V1", "V4")), id = TRUE) #> grouping V1 V4 SumV2 N #> <int> <int> <char> <int> <int> #> 1: 1 1 <NA> 25 5 #> 2: 1 2 <NA> 20 4 #> 3: 0 1 A 8 2 #> 4: 0 2 B 10 2 #> 5: 0 1 C 12 2 #> 6: 0 2 A 4 1 #> 7: 0 1 B 5 1 #> 8: 0 2 C 6 1 # Miscellaneous ## Read / Write data tidyfst exports data.table::fread and data.table::fwrite directly. ### Write data to a csv file fwrite(DT, "DT.csv") ### Write data to a tab-delimited file fwrite(DT, "DT.txt", sep = "\t") ### Write list-column data to a csv file fwrite(setDT(list(0, list(1:5))), "DT2.csv") # ### Read a csv / tab-delimited file fread("DT.csv") # fread("DT.csv", verbose = TRUE) # full details fread("DT.txt", sep = "\t") ### Read a csv file selecting / droping columns fread("DT.csv", select = c("V1", "V4")) fread("DT.csv", drop = "V4") # NA ### Read and rbind several files rbindlist(lapply(c("DT.csv", "DT.csv"), fread)) # c("DT.csv", "DT.csv") %>% lapply(fread) %>% rbindlist ## Reshape data ### Melt data (from wide to long) fsetequal(DT,DF) #> [1] TRUE mDT = DT %>% longer_dt(V3,V4) mDF = DF %>% longer_dt(-"V1|V2") fsetequal(mDT,mDF) #> [1] TRUE mDT #> V3 V4 name value #> <num> <char> <fctr> <int> #> 1: 0.5 A V1 1 #> 2: 1.0 B V1 2 #> 3: 1.5 C V1 1 #> 4: 0.5 A V1 2 #> 5: 1.0 B V1 1 #> 6: 1.5 C V1 2 #> 7: 0.5 A V1 1 #> 8: 1.0 B V1 2 #> 9: 1.5 C V1 1 #> 10: 0.5 A V2 1 #> 11: 1.0 B V2 2 #> 12: 1.5 C V2 3 #> 13: 0.5 A V2 4 #> 14: 1.0 B V2 5 #> 15: 1.5 C V2 6 #> 16: 0.5 A V2 7 #> 17: 1.0 B V2 8 #> 18: 1.5 C V2 9 ### Cast data (from long to wide) mDT %>% wider_dt(V4,name = "name",value = "value") #> Aggregate function missing, defaulting to 'length' #> Key: <V4> #> V4 V1 V2 #> <char> <int> <int> #> 1: A 3 3 #> 2: B 3 3 #> 3: C 3 3 # below is a special case and could only be done in tidyfst mDT %>% wider_dt(V4,name = "name",value = "value",fun = list) #> V4 V1 V2 #> <char> <int> <int> #> 1: A 1 1 #> 2: A 2 4 #> 3: A 1 7 #> 4: B 2 2 #> 5: B 1 5 #> 6: B 2 8 #> 7: C 1 3 #> 8: C 2 6 #> 9: C 1 9 mDT %>% wider_dt(V4,name = "name",value = "value",fun = sum) #> Key: <V4> #> V4 V1 V2 #> <char> <int> <int> #> 1: A 4 12 #> 2: B 5 15 #> 3: C 4 18 ### Split split(DT, by = "V4") #>$A
#>       V1    V2    V3     V4
#>    <int> <int> <num> <char>
#> 1:     1     1   0.5      A
#> 2:     2     4   0.5      A
#> 3:     1     7   0.5      A
#>
#> $B #> V1 V2 V3 V4 #> <int> <int> <num> <char> #> 1: 2 2 1 B #> 2: 1 5 1 B #> 3: 2 8 1 B #> #>$C
#>       V1    V2    V3     V4
#>    <int> <int> <num> <char>
#> 1:     1     3   1.5      C
#> 2:     2     6   1.5      C
#> 3:     1     9   1.5      C

## Other

### Lead/Lag
lag_dt(1:10,n = 1)
#>  [1] NA  1  2  3  4  5  6  7  8  9
lag_dt(1:10,n = 1:2)
#> [[1]]
#>  [1] NA  1  2  3  4  5  6  7  8  9
#>
#> [[2]]
#>  [1] NA NA  1  2  3  4  5  6  7  8
#>  [1]  2  3  4  5  6  7  8  9 10 NA

# Join/Bind data sets

## Join

x <- data.table(Id  = c("A", "B", "C", "C"),
X1  = c(1L, 3L, 5L, 7L),
XY  = c("x2", "x4", "x6", "x8"),
key = "Id")

y <- data.table(Id  = c("A", "B", "B", "D"),
Y1  = c(1L, 3L, 5L, 7L),
XY  = c("y1", "y3", "y5", "y7"),
key = "Id")

### left join
left_join_dt(x, y, by = "Id")
#> Key: <Id>
#>        Id    X1   XY.x    Y1   XY.y
#>    <char> <int> <char> <int> <char>
#> 1:      A     1     x2     1     y1
#> 2:      B     3     x4     3     y3
#> 3:      B     3     x4     5     y5
#> 4:      C     5     x6    NA   <NA>
#> 5:      C     7     x8    NA   <NA>

### right join
right_join_dt(x, y, by = "Id")
#> Key: <Id>
#>        Id    X1   XY.x    Y1   XY.y
#>    <char> <int> <char> <int> <char>
#> 1:      A     1     x2     1     y1
#> 2:      B     3     x4     3     y3
#> 3:      B     3     x4     5     y5
#> 4:      D    NA   <NA>     7     y7

### inner join
inner_join_dt(x, y, by = "Id")
#> Key: <Id>
#>        Id    X1   XY.x    Y1   XY.y
#>    <char> <int> <char> <int> <char>
#> 1:      A     1     x2     1     y1
#> 2:      B     3     x4     3     y3
#> 3:      B     3     x4     5     y5

### full join
full_join_dt(x, y, by = "Id")
#> Key: <Id>
#>        Id    X1   XY.x    Y1   XY.y
#>    <char> <int> <char> <int> <char>
#> 1:      A     1     x2     1     y1
#> 2:      B     3     x4     3     y3
#> 3:      B     3     x4     5     y5
#> 4:      C     5     x6    NA   <NA>
#> 5:      C     7     x8    NA   <NA>
#> 6:      D    NA   <NA>     7     y7

### semi join
semi_join_dt(x, y, by = "Id")
#> Key: <Id>
#>        Id    X1     XY
#>    <char> <int> <char>
#> 1:      A     1     x2
#> 2:      B     3     x4

### anti join
anti_join_dt(x, y, by = "Id")
#> Key: <Id>
#>        Id    X1     XY
#>    <char> <int> <char>
#> 1:      C     5     x6
#> 2:      C     7     x8

## Bind

x <- data.table(1:3)
y <- data.table(4:6)
z <- data.table(7:9, 0L)
### Bind rows
rbind(x, y)
#>       V1
#>    <int>
#> 1:     1
#> 2:     2
#> 3:     3
#> 4:     4
#> 5:     5
#> 6:     6
rbind(x, z, fill = TRUE)
#>       V1    V2
#>    <int> <int>
#> 1:     1    NA
#> 2:     2    NA
#> 3:     3    NA
#> 4:     7     0
#> 5:     8     0
#> 6:     9     0

### Bind rows using a list
rbindlist(list(x, y), idcol = TRUE)
#>      .id    V1
#>    <int> <int>
#> 1:     1     1
#> 2:     1     2
#> 3:     1     3
#> 4:     2     4
#> 5:     2     5
#> 6:     2     6

### Bind columns
cbind(x, y)
#>       V1    V1
#>    <int> <int>
#> 1:     1     4
#> 2:     2     5
#> 3:     3     6

## Set operations

x <- data.table(c(1, 2, 2, 3, 3))
y <- data.table(c(2, 2, 3, 4, 4))
### Intersection
fintersect(x, y)
#>       V1
#>    <num>
#> 1:     2
#> 2:     3
fintersect(x, y, all = TRUE)
#>       V1
#>    <num>
#> 1:     2
#> 2:     2
#> 3:     3

### Difference
fsetdiff(x, y)
#>       V1
#>    <num>
#> 1:     1
fsetdiff(x, y, all = TRUE)
#>       V1
#>    <num>
#> 1:     1
#> 2:     3

### Union
funion(x, y)
#>       V1
#>    <num>
#> 1:     1
#> 2:     2
#> 3:     3
#> 4:     4
funion(x, y, all = TRUE)
#>        V1
#>     <num>
#>  1:     1
#>  2:     2
#>  3:     2
#>  4:     3
#>  5:     3
#>  6:     2
#>  7:     2
#>  8:     3
#>  9:     4
#> 10:     4

### Equality
fsetequal(x, x[order(-V1),])
#> [1] TRUE
all.equal(x, x) # S3 method
#> [1] TRUE
setequal(x, x[order(-V1),])
#> [1] TRUE

# Summary

To break all these codes through, tidyfst has improved bit by bit. If you are using tidyfst frequently, you’ll find that while it enjoys a tidy syntax, it is more like you are using data.table in another style. Compared with many other packages with similar goals, tidyfst sticks to many principles of data.table (and is more like data.table in many ways). However, the ultimate goal is still clear: providing users with state-of-the-art data manipulation tools with least pain. Therefore, keep it simple and make it fast. Enjoy tidyfst~