database best practices · Issue #46 · ropensci-books/drake (original) (raw)

Thanks so much for this reprex, Amanda. I think your example ran smoothly because make() does not actually attempt to reload the connection from the cache. Below, I force it to happen (note the "load 2 items: this_con, seeded" message).

library(drake) library(DBI) library(tidyverse) pkgconfig::set_config("drake::strings_in_dots" = "literals")

Funs

connect_to_db <- function() { dbConnect(RSQLite::SQLite(), "db") }

seed_db <- function(conn) { dbWriteTable(conn, "mtcars", mtcars, overwrite = TRUE) }

get_res <- function(conn) { dbGetQuery(conn, "SELECT * FROM mtcars") %>% as_tibble() }

update_db_records <- function(tbl, conn) { new <- tbl %>% map_dfr(log)

dbWriteTable(conn, "mtcars", new, append = FALSE, overwrite = TRUE ) }

test_is_original <- function() { testthat::expect_equal( mtcars %>% as_tibble(), get_res(con) ) }

test_is_log <- function() { testthat::expect_equal( mtcars %>% map_dfr(log), get_res(con) ) }

Create db and write mtcars table

con <- connect_to_db() seed_db(con) test_is_original()

Connection for updating is a target in the plan

plan <- drake_plan( this_con = connect_to_db(), seeded = get_res(this_con), processed_data = update_db_records(seeded, this_con) )

Do the planned database work.

make(plan, verbose = 4) #> cache /tmp/RtmpCzTQKd/reprex2c0718c63f5/.drake #> analyze environment #> analyze 8 imports: seed_db, test_is_original, update_db_records, test_is... #> analyze 3 targets: this_con, seeded, processed_data #> construct graph edges #> construct vertex attributes #> construct graph #> import get_res #> import update_db_records #> import RSQLite::SQLite #> import connect_to_db #> target this_con #> store this_con #> target seeded #> store seeded #> target processed_data #> store processed_data

Just remove process_data

so the next make() loads the connection from the cache.

clean(processed_data) make(plan, verbose = 4) #> cache /tmp/RtmpCzTQKd/reprex2c0718c63f5/.drake #> analyze environment #> analyze 8 imports: seed_db, test_is_original, update_db_records, test_is... #> import get_res #> import update_db_records #> import RSQLite::SQLite #> import connect_to_db #> skip this_con #> skip seeded #> load 2 items: this_con, seeded #> target processed_data #> fail processed_data #> Error: Target processed_data failed. Call diagnose(processed_data) for details. Error message: #> external pointer is not valid

Created on 2018-10-17 by the reprex package (v0.2.1)

Interestingly, this is not a problem if you use an in-memory cache.

library(drake) library(DBI) library(tidyverse) pkgconfig::set_config("drake::strings_in_dots" = "literals")

Funs

connect_to_db <- function() { dbConnect(RSQLite::SQLite(), "db") }

seed_db <- function(conn) { dbWriteTable(conn, "mtcars", mtcars, overwrite = TRUE) }

get_res <- function(conn) { dbGetQuery(conn, "SELECT * FROM mtcars") %>% as_tibble() }

update_db_records <- function(tbl, conn) { new <- tbl %>% map_dfr(log)

dbWriteTable(conn, "mtcars", new, append = FALSE, overwrite = TRUE ) }

test_is_original <- function() { testthat::expect_equal( mtcars %>% as_tibble(), get_res(con) ) }

test_is_log <- function() { testthat::expect_equal( mtcars %>% map_dfr(log), get_res(con) ) }

Create db and write mtcars table

con <- connect_to_db() seed_db(con) test_is_original()

Connection for updating is a target in the plan

plan <- drake_plan( this_con = connect_to_db(), seeded = get_res(this_con), processed_data = update_db_records(seeded, this_con) )

Do the planned database work.

cache <- storr::storr_environment() make(plan, verbose = 4, cache = cache) #> analyze environment #> analyze 9 imports: seed_db, test_is_original, update_db_records, cache, ... #> analyze 3 targets: this_con, seeded, processed_data #> construct graph edges #> construct vertex attributes #> construct graph #> import get_res #> import update_db_records #> import RSQLite::SQLite #> import connect_to_db #> target this_con #> store this_con #> target seeded #> store seeded #> target processed_data #> store processed_data

Just remove process_data

so the next make() loads the connection from the cache.

clean(processed_data, cache = cache) make(plan, verbose = 4, cache = cache) #> analyze environment #> analyze 9 imports: seed_db, test_is_original, update_db_records, cache, ... #> import get_res #> import update_db_records #> import RSQLite::SQLite #> import connect_to_db #> skip this_con #> skip seeded #> load 2 items: this_con, seeded #> target processed_data #> store processed_data

Created on 2018-10-17 by the reprex package (v0.2.1)