Title: Easily Access and Maintain Time-Based Versioned Data (Slowly-Changing-Dimension)
Description: A collection of functions that enable easy access and updating of a database of data over time. More specifically, the package facilitates type-2 history for data warehouses and provides a number of quality-of-life improvements for working on SQL databases with R. For reference, see Ralph Kimball and Margy Ross (2013, ISBN 9781118530801).
Authors: Rasmus Skytte Randløv [aut, cre, rev]
Maintainer: Rasmus Skytte Randløv <[email protected]>
License: GPL-3
Version: 0.5.1.9000
Built: 2025-02-27 13:30:02 UTC
Source: https://github.com/ssi-dk/scdb
Close a connection to the database

Usage:
close_connection(conn)

Arguments:
conn: (DBIConnection(1)) Connection object.

Value:
dbDisconnect() returns TRUE, invisibly.

Examples:
conn <- get_connection()
close_connection(conn)
Create indexes on a table

Usage:
create_index(conn, db_table, columns)

Arguments:
conn: (DBIConnection(1)) Connection object.
db_table: (id-like object(1)) A table specification (coercible by id()).
columns: (character()) The columns to create the index on.

Value:
NULL (called for side effects).

Examples:
conn <- get_connection()
mt <- dplyr::copy_to(conn, dplyr::distinct(mtcars, .data$mpg, .data$cyl), name = "mtcars")
create_index(conn, mt, c("mpg", "cyl"))
close_connection(conn)
Create a table with the SCDB log structure if it does not exist

Usage:
create_logs_if_missing(conn, log_table)

Arguments:
conn: (DBIConnection(1)) Connection object.
log_table: (id-like object(1)) A table specification for the log table (coercible by id()).

Value:
Invisibly returns the generated (or existing) log table.

Examples:
conn <- get_connection()
log_table <- id("test.logs", conn = conn, allow_table_only = TRUE)
create_logs_if_missing(conn, log_table)
close_connection(conn)
Create a historical table from input data

Usage:
create_table(.data, conn = NULL, db_table, ...)

Arguments:
.data: (data.frame(1) or tbl_dbi(1)) The data from which to create the historical table.
conn: (DBIConnection(1)) Connection object. If NULL, the table is created locally.
db_table: (id-like object(1)) A table specification (coercible by id()).
...: Other arguments passed to the table-creation backend.

Value:
Invisibly returns the table as it looks on the destination (or locally if conn is NULL).

Examples:
conn <- get_connection()
create_table(mtcars, conn = conn, db_table = "mtcars")
close_connection(conn)
This set of functions adds a simple locking system to database tables.

lock_table() adds a record to the schema.locks table with the current time and the R session's process ID.
unlock_table() removes records in the schema.locks table matching the target table and the R session's process ID.

When locking a table, the function checks for existing locks on the table and produces an error if a lock is held by a process which no longer exists. In this case, the lock needs to be removed manually by deleting the record from the lock table. In addition, the error implies that the table may have partial updates that need to be manually rolled back.
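As a sketch of such manual cleanup (the lock table location and its column names below are assumptions; inspect your actual lock table before deleting anything):

# Hypothetical illustration: inspect the lock table and clear a stale lock.
locks <- dplyr::tbl(conn, id("schema.locks", conn = conn))
print(locks)

# Remove the record held by the defunct process (here assumed to be PID 12345,
# stored in a column named "pid")
DBI::dbExecute(conn, 'DELETE FROM "schema"."locks" WHERE "pid" = 12345')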
Usage:
lock_table(conn, db_table, schema = NULL)
unlock_table(conn, db_table, schema = NULL, pid = Sys.getpid())

Arguments:
conn: (DBIConnection(1)) Connection object.
db_table: (character(1)) The table to lock/unlock.
schema: (character(1)) The schema containing the lock table.
pid: (numeric(1)) The process ID to remove the lock for (defaults to the current R session's process ID).

Value:
lock_table() returns TRUE (FALSE) if the lock was (un)successfully added.
If a lock exists for a non-active process, an error is thrown.
unlock_table() returns NULL (called for side effects).

Examples:
conn <- DBI::dbConnect(RSQLite::SQLite())
lock_table(conn, "test_table") # TRUE
unlock_table(conn, "test_table")
DBI::dbDisconnect(conn)
Determine the type of timestamps the database supports

Usage:
db_timestamp(timestamp, conn = NULL)

Arguments:
timestamp: (POSIXct(1), Date(1), or character(1)) The timestamp to convert.
conn: (DBIConnection(1)) Connection object.

Value:
The given timestamp converted to an SQL-backend-dependent timestamp.

Examples:
conn <- get_connection()
db_timestamp(Sys.time(), conn)
close_connection(conn)
This function marks a table for deletion once the current function exits.

Usage:
defer_db_cleanup(db_table)

Arguments:
db_table: (tbl_dbi(1)) The table to mark for deletion.

Value:
NULL (called for side effects).

Examples:
conn <- get_connection()
mt <- dplyr::copy_to(conn, mtcars)
id_mt <- id(mt)
defer_db_cleanup(mt)
DBI::dbExistsTable(conn, id_mt) # TRUE
withr::deferred_run()
DBI::dbExistsTable(conn, id_mt) # FALSE
close_connection(conn)
Computes a checksum from columns

Usage:
digest_to_checksum(.data, col = "checksum", exclude = NULL)

Arguments:
.data: (data.frame(1) or tbl_dbi(1)) The data to compute checksums for.
col: (character(1)) The name of the column the checksums are written to.
exclude: (character()) Columns to exclude from the checksum computation.

Details:
In most cases, the MD5 algorithm is used to compute the checksums. For Microsoft SQL Server, the SHA-256 algorithm is used.

Value:
.data with a checksum column added.

Examples:
digest_to_checksum(mtcars)
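As a small usage sketch of the remaining arguments (the column choices here are illustrative):

# Name the checksum column explicitly and leave selected columns out of the hash
digest_to_checksum(mtcars, col = "hash", exclude = c("vs", "am"))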
If filters is NULL, no filtering is done.
Otherwise, the .data object is filtered via an inner_join() using all columns of the filter:
inner_join(.data, filters, by = colnames(filters)).
The by and na_by arguments can overwrite the inner_join() columns used in the filtering.

Usage:
filter_keys(.data, filters, by = NULL, na_by = NULL, ...)

Arguments:
.data: (data.frame(1) or tbl_dbi(1)) The data to filter.
filters: (data.frame(1) or tbl_dbi(1)) An object whose key combinations are used to subset .data. If NULL, no filtering is done.
by: A join specification created with dplyr::join_by(), or a character vector of variables to join by. If NULL, the default, the join uses all variables in common across .data and filters.
na_by: (character()) Columns to join by where NA / NULL values should match (see the *_join overloads).
...: Further arguments passed to the join.

Value:
An object of the same class as .data.

Examples:
# Filtering with NULL means no filtering is done
filter <- NULL
identical(filter_keys(mtcars, filter), mtcars) # TRUE

# Filtering by vs = 0
filter <- data.frame(vs = 0)
identical(filter_keys(mtcars, filter), dplyr::filter(mtcars, vs == 0)) # TRUE

# Filtering by the specific combinations of vs = 0 and am = 1
filter <- dplyr::distinct(mtcars, vs, am)
filter_keys(mtcars, filter)
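A minimal sketch of the na_by override on remote tables (the data here is illustrative; na_by relies on the SQL-backed join methods documented below):

conn <- get_connection()
data <- dplyr::copy_to(conn, data.frame(key = c("A", NA), value = 1:2), name = "data")
filter <- dplyr::copy_to(conn, data.frame(key = NA_character_), name = "filter")

# Keep only rows whose key matches the filter, treating NA as equal to NA
filter_keys(data, filter, na_by = "key")
close_connection(conn)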
Get the current schema/catalog of database-related objects

Usage:
get_catalog(obj, ...)

## S3 method for class 'Microsoft SQL Server'
get_catalog(obj, temporary = FALSE, ...)

get_schema(obj, ...)

## S3 method for class 'PqConnection'
get_schema(obj, temporary = FALSE, ...)

## S3 method for class 'SQLiteConnection'
get_schema(obj, temporary = FALSE, ...)

Arguments:
obj: (DBIConnection(1), tbl_dbi(1), or Id(1)) The object to get the schema/catalog from.
...: Further arguments passed to methods.
temporary: (logical(1)) Should the temporary schema/catalog be returned?

Details:
The catalog is extracted from obj depending on the type of input:

For get_catalog.Microsoft SQL Server, the current database context of the connection, or "tempdb" if temporary = TRUE.
For get_catalog.tbl_dbi, the catalog is determined via id().
For get_catalog.*, NULL is returned.

The schema is extracted from obj depending on the type of input:

For get_schema.DBIConnection(), the current schema of the connection if temporary = FALSE. See "Default schema" for more. If temporary = TRUE, the temporary schema of the connection is returned.
For get_schema.tbl_dbi(), the schema is determined via id().
For get_schema.Id(), the schema is extracted from the Id specification.

Default schema:
In some backends, it is possible to modify settings so that, when a schema is not explicitly stated in a query, the backend searches for the table in this schema by default.
For Postgres databases, this can be shown with SELECT CURRENT_SCHEMA() (defaults to public) and modified with SET search_path TO { schema }.
For SQLite databases, a temp schema for temporary tables always exists, as well as a main schema for permanent tables. Additional databases may be attached to the connection with a named schema, but as the attachment must be made after the connection is established, get_schema will never return any of these, as the default schema will always be main.
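As a brief sketch (assuming a PostgreSQL connection named conn), the default schema can be inspected and changed from R:

# Show the schema used when none is stated explicitly
DBI::dbGetQuery(conn, "SELECT CURRENT_SCHEMA()")

# Search the "test" schema first for unqualified table names
DBI::dbExecute(conn, "SET search_path TO test")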
Examples:
conn <- get_connection()
dplyr::copy_to(conn, mtcars, name = "mtcars", temporary = FALSE)
get_schema(conn)
get_schema(get_table(conn, id("mtcars", conn = conn)))
get_catalog(conn)
get_catalog(get_table(conn, id("mtcars", conn = conn)))
close_connection(conn)
This is a convenience wrapper for DBI::dbConnect() for different database backends.

Connects to the specified dbname on host:port using the user and password from the given arguments (if applicable). Certain drivers may use credentials stored in a file, such as ~/.pgpass (PostgreSQL).

Usage:
get_connection(drv, ...)

## S3 method for class 'SQLiteDriver'
get_connection(
  drv,
  dbname = ":memory:",
  ...,
  bigint = c("integer", "bigint64", "numeric", "character")
)

## S3 method for class 'PqDriver'
get_connection(
  drv,
  dbname = NULL,
  host = NULL,
  port = NULL,
  password = NULL,
  user = NULL,
  ...,
  bigint = c("integer", "bigint64", "numeric", "character"),
  check_interrupts = TRUE,
  timezone = Sys.timezone(),
  timezone_out = Sys.timezone()
)

## S3 method for class 'OdbcDriver'
get_connection(
  drv,
  dsn = NULL,
  ...,
  bigint = c("integer", "bigint64", "numeric", "character"),
  timezone = Sys.timezone(),
  timezone_out = Sys.timezone()
)

## S3 method for class 'duckdb_driver'
get_connection(
  drv,
  dbdir = ":memory:",
  ...,
  bigint = c("numeric", "character"),
  timezone_out = Sys.timezone()
)

## Default S3 method:
get_connection(drv, ...)

Arguments:
drv: (DBIDriver(1)) The database driver to connect with.
...: Additional parameters sent to DBI::dbConnect().
dbname: (character(1)) The name of the database to connect to (or the path to the database file for SQLite).
bigint: (character(1)) The R type that 64-bit integer types should be mapped to.
host: (character(1)) The host name or IP address of the database server.
port: (character(1) or numeric(1)) The port the database server listens on.
password: (character(1)) The password for the given user.
user: (character(1)) The user name used for the connection.
check_interrupts: (logical(1)) Should user interrupts be checked during the query execution?
timezone: (character(1)) The time zone of the connection.
timezone_out: (character(1)) The time zone values should be converted to when retrieved to R.
dsn: (character(1)) The data source name to connect to.
dbdir: (character(1)) The location of the DuckDB database (":memory:" for an in-memory database).

Value:
An object that inherits from DBIConnection, as created by the driver specified in drv.

Examples:
conn <- get_connection(drv = RSQLite::SQLite(), dbname = ":memory:")
DBI::dbIsValid(conn) # TRUE
close_connection(conn)
DBI::dbIsValid(conn) # FALSE
Retrieves a named table from a given schema on the connection

Usage:
get_table(conn, db_table = NULL, slice_ts = NA, include_slice_info = FALSE)

Arguments:
conn: (DBIConnection(1)) Connection object.
db_table: (id-like object(1)) A table specification (coercible by id()). If NULL, the available tables are listed.
slice_ts: (POSIXct(1), Date(1), or character(1)) A timestamp to slice a historical table on. If NA (the default), no slicing is done.
include_slice_info: (logical(1)) Should the history columns (checksum, from_ts, until_ts) be included in the output?

Value:
A "lazy" data.frame (tbl_lazy) generated using dbplyr.
Note that a temporary table will be preferred over ordinary tables in the default schema (see get_schema()) with an identical name.

Examples:
conn <- get_connection()
dplyr::copy_to(conn, mtcars, name = "mtcars", temporary = FALSE)
get_table(conn)
if (table_exists(conn, "mtcars")) {
  get_table(conn, "mtcars")
}
close_connection(conn)
List the available tables on the connection

Usage:
get_tables(conn, pattern = NULL, show_temporary = TRUE)

Arguments:
conn: (DBIConnection(1)) Connection object.
pattern: (character(1)) A regular expression to filter the table names by.
show_temporary: (logical(1)) Should temporary tables be included in the list?

Value:
A data.frame containing table names, including schema (and catalog, when available), in the database.

Examples:
conn <- get_connection()
dplyr::copy_to(conn, mtcars, name = "my_test_table_1", temporary = FALSE)
dplyr::copy_to(conn, mtcars, name = "my_test_table_2")
get_tables(conn, pattern = "my_[th]est")
get_tables(conn, pattern = "my_[th]est", show_temporary = FALSE)
close_connection(conn)
Convenience function for DBI::Id

Usage:
id(db_table, ...)

## S3 method for class 'Id'
id(db_table, conn = NULL, ...)

## S3 method for class 'character'
id(db_table, conn = NULL, allow_table_only = TRUE, ...)

## S3 method for class 'data.frame'
id(db_table, ...)

Arguments:
db_table: (id-like object(1)) The object to parse to a DBI::Id.
...: Further arguments passed to methods.
conn: (DBIConnection(1)) Connection object.
allow_table_only: (logical(1)) If TRUE, a character input may resolve to a table (without schema) when the implied schema is not found on conn.

Details:
The given db_table is parsed to a DBI::Id depending on the type of input:

character: db_table is parsed to a DBI::Id object using an assumption of "schema.table" syntax with corresponding schema (if found in conn) and table values. If no schema is implied, the default schema of conn will be used.
DBI::Id: if the schema is not specified in the Id, the schema is set to the default schema for conn (if given).
tbl_sql: the remote name is used to resolve the table identification.
data.frame: an Id is built from the columns of the data.frame (catalog, schema, and table). Can be used in conjunction with get_tables(conn, pattern).

Value:
A DBI::Id object parsed from db_table (see Details).

See also:
DBI::Id, which this function wraps.

Examples:
id("schema.table")
The function "interlaces" the queries and combines their validity time axes (valid_from and valid_until) onto a single time axis.
interlace(tables, by = NULL, colnames = NULL)
interlace(tables, by = NULL, colnames = NULL)
tables |
( |
by |
( |
colnames |
( |
The combination of input queries with a single, interlaced valid_from / valid_until time axis.
The combination of input queries with a single, interlaced valid_from / valid_until time axis
conn <- get_connection() t1 <- data.frame(key = c("A", "A", "B"), obs_1 = c(1, 2, 2), valid_from = as.Date(c("2021-01-01", "2021-02-01", "2021-01-01")), valid_until = as.Date(c("2021-02-01", "2021-03-01", NA))) t1 <- dplyr::copy_to(conn, df = t1, name = "t1") t2 <- data.frame(key = c("A", "B"), obs_2 = c("a", "b"), valid_from = as.Date(c("2021-01-01", "2021-01-01")), valid_until = as.Date(c("2021-04-01", NA))) t2 <- dplyr::copy_to(conn, df = t2, name = "t2") interlace(list(t1, t2), by = "key") close_connection(conn)
conn <- get_connection() t1 <- data.frame(key = c("A", "A", "B"), obs_1 = c(1, 2, 2), valid_from = as.Date(c("2021-01-01", "2021-02-01", "2021-01-01")), valid_until = as.Date(c("2021-02-01", "2021-03-01", NA))) t1 <- dplyr::copy_to(conn, df = t1, name = "t1") t2 <- data.frame(key = c("A", "B"), obs_2 = c("a", "b"), valid_from = as.Date(c("2021-01-01", "2021-01-01")), valid_until = as.Date(c("2021-04-01", NA))) t2 <- dplyr::copy_to(conn, df = t2, name = "t2") interlace(list(t1, t2), by = "key") close_connection(conn)
Checks if a table contains historical data

Usage:
is.historical(.data)

Arguments:
.data: (data.frame(1) or tbl_dbi(1)) The object to check.

Value:
TRUE if .data contains the columns "checksum", "from_ts", and "until_ts"; FALSE otherwise.

Examples:
conn <- get_connection()
dplyr::copy_to(conn, mtcars, name = "mtcars", temporary = FALSE)
create_table(mtcars, conn, db_table = id("mtcars_historical", conn))
is.historical(get_table(conn, "mtcars")) # FALSE
is.historical(get_table(conn, "mtcars_historical")) # TRUE
close_connection(conn)
Overloads the dplyr *_join functions to accept an na_by argument.

By default, joining using SQL does not match on NA / NULL.
The dbplyr *_joins have the option na_matches = "na" to match on NA / NULL, but this is very inefficient in some cases.
This function does the matching more efficiently: if a column contains NA / NULL, the names of these columns can be passed via the na_by argument and are matched efficiently, as if na_matches = "na" were set.
If no na_by argument is given, the function defaults to using dplyr::*_join.
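A minimal sketch of the na_by argument, reusing the memdb tables from the examples below:

library(dplyr, warn.conflicts = FALSE)
library(dbplyr, warn.conflicts = FALSE)

db <- memdb_frame(x = c(1, 2, NA))
label <- memdb_frame(x = c(1, NA), label = c("one", "missing"))

# Match the NA keys efficiently instead of passing na_matches = "na"
left_join(db, label, na_by = "x")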
Usage:
## S3 method for class 'tbl_sql'
inner_join(x, y, by = NULL, ...)

## S3 method for class 'tbl_sql'
left_join(x, y, by = NULL, ...)

## S3 method for class 'tbl_sql'
right_join(x, y, by = NULL, ...)

## S3 method for class 'tbl_sql'
full_join(x, y, by = NULL, ...)

## S3 method for class 'tbl_sql'
semi_join(x, y, by = NULL, ...)

## S3 method for class 'tbl_sql'
anti_join(x, y, by = NULL, ...)

Arguments:
x, y: A pair of lazy data frames backed by database queries.
by: A join specification created with dplyr::join_by(), or a character vector of variables to join by. If NULL, the default, the join performs a natural join, using all variables in common across x and y.
...: Other parameters passed onto methods.

Value:
Another tbl_lazy. Use show_query() to see the generated query, and use collect() to execute the query and return data to R.

See also:
dplyr::mutate-joins, which this function wraps.
dbplyr::join.tbl_sql, which this function wraps.

Examples:
library(dplyr, warn.conflicts = FALSE)
library(dbplyr, warn.conflicts = FALSE)

band_db <- tbl_memdb(dplyr::band_members)
instrument_db <- tbl_memdb(dplyr::band_instruments)
left_join(band_db, instrument_db) %>% show_query()

# Can join with local data frames by setting copy = TRUE
left_join(band_db, dplyr::band_instruments, copy = TRUE)

# Unlike R, joins in SQL don't usually match NAs (NULLs)
db <- memdb_frame(x = c(1, 2, NA))
label <- memdb_frame(x = c(1, NA), label = c("one", "missing"))
left_join(db, label, by = "x")

# But you can activate R's usual behaviour with the na_matches argument
left_join(db, label, by = "x", na_matches = "na")

# By default, joins are equijoins, but you can use `sql_on` to
# express richer relationships
db1 <- memdb_frame(x = 1:5)
db2 <- memdb_frame(x = 1:3, y = letters[1:3])
left_join(db1, db2) %>% show_query()
left_join(db1, db2, sql_on = "LHS.x < RHS.x") %>% show_query()
The Logger class facilitates logging to a database and/or file and to the console.

A Logger is associated with a specific table and timestamp, which must be supplied at initialization.
This information is used to create the log file (if a log_path is given) and the log entry in the database (if a log_table_id and log_conn are given).
Logging to the database must match the fields in the log table.
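As a sketch, the defaults consumed by Logger$new() can be configured globally via options (the option names below are the ones read by the initializer; the values are illustrative):

# Write file logs to this directory by default
options(SCDB.log_path = "logs")

# Store database log entries in this table by default
options(SCDB.log_table_id = "test.logs")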
Value:
A new instance of the Logger R6 class.

Active bindings:
output_to_console: (logical(1)) Should the Logger output to console? Read only. This can always be overridden by Logger$log_info(..., output_to_console = FALSE).
log_path: (character(1)) The location log files are written to (if this is not NULL). Defaults to getOption("SCDB.log_path"). Read only.
log_tbl: (tbl_dbi(1)) The database table used for logging. Class is connection-specific, but inherits from tbl_dbi. Read only.
start_time: (POSIXct(1)) The time at which data processing was started. Read only.
log_filename: (character(1)) The filename (basename) of the file that the Logger instance will output to. Read only.
log_realpath: (character(1)) The full path to the logger's log file. Read only.
Method new():
Create a new Logger object.

Usage:
Logger$new(
  db_table = NULL,
  timestamp = NULL,
  output_to_console = TRUE,
  log_table_id = getOption("SCDB.log_table_id"),
  log_conn = NULL,
  log_path = getOption("SCDB.log_path"),
  start_time = Sys.time(),
  warn = TRUE
)

Arguments:
db_table: (id-like object(1)) A table specification (coercible by id()) specifying the table being updated.
timestamp: (POSIXct(1), Date(1), or character(1)) A timestamp describing the data being processed (not the current time).
output_to_console: (logical(1)) Should the Logger output to console?
log_table_id: (id-like object(1)) A table specification (coercible by id()) specifying the location of the log table.
log_conn: (DBIConnection(1)) A database connection where the log table should exist.
log_path: (character(1)) The path where logs are stored. If NULL, no file logs are created.
start_time: (POSIXct(1)) The time at which data processing was started (defaults to Sys.time()).
warn: (logical(1)) Should a warning be produced if no logging will be done?
Method log_info():
Write a line to the log (console / file).

Usage:
Logger$log_info(
  ...,
  tic = Sys.time(),
  output_to_console = self$output_to_console,
  log_type = "INFO",
  timestamp_format = getOption("SCDB.log_timestamp_format", "%F %R:%OS3")
)

Arguments:
...: (character()) Character strings to be concatenated as the log message.
tic: (POSIXct(1)) The timestamp used by the log entry.
output_to_console: (logical(1)) Should the line be written to console?
log_type: (character(1)) The severity of the log message.
timestamp_format: (character(1)) The format of the timestamp used in the log message (parsable by strftime()).

Returns:
Returns the log message, invisibly.
Method log_warn():
Write a warning to the log file and generate a warning.

Usage:
Logger$log_warn(..., log_type = "WARNING")

Arguments:
...: (character()) Character strings to be concatenated as the log message.
log_type: (character(1)) The severity of the log message.

Method log_error():
Write an error to the log file and stop execution.

Usage:
Logger$log_error(..., log_type = "ERROR")

Arguments:
...: (character()) Character strings to be concatenated as the log message.
log_type: (character(1)) The severity of the log message.
Method log_to_db():
Write or update the log table.

Usage:
Logger$log_to_db(...)

Arguments:
...: (name-value pairs) Structured data written to the database log table. The name indicates the column and the value indicates what is to be written.

Method finalize_db_entry():
Auto-fills "end_time" and "duration" for the log entry and clears the "log_file" field if no file is being written.

Usage:
Logger$finalize_db_entry(end_time = Sys.time())

Arguments:
end_time: (POSIXct(1), Date(1), or character(1)) The end time for the log entry.

Method clone():
The objects of this class are cloneable with this method.

Usage:
Logger$clone(deep = FALSE)

Arguments:
deep: Whether to make a deep clone.
Examples:
logger <- Logger$new(
  db_table = "test.table",
  timestamp = "2020-01-01 09:00:00"
)
logger$log_info("This is an info message")
logger$log_to_db(message = "This is a message")
try(logger$log_warn("This is a warning!"))
try(logger$log_error("This is an error!"))
The LoggerNull class overwrites the functions of the Logger so that no logging is produced.
Errors and warnings are still produced.

Value:
A new instance of the LoggerNull R6 class.

Super class:
SCDB::Logger -> LoggerNull

Method new():
Create a new LoggerNull object.

Usage:
LoggerNull$new(...)

Arguments:
...: Captures the arguments given, but does nothing.

Method log_to_db():
Matches the signature of Logger$log_to_db(), but does nothing.

Usage:
LoggerNull$log_to_db(...)

Arguments:
...: Captures the arguments given, but does nothing.

Method finalize_db_entry():
Matches the signature of Logger$finalize_db_entry(), but does nothing.

Usage:
LoggerNull$finalize_db_entry(...)

Arguments:
...: Captures the arguments given, but does nothing.

Method clone():
The objects of this class are cloneable with this method.

Usage:
LoggerNull$clone(deep = FALSE)

Arguments:
deep: Whether to make a deep clone.

Examples:
logger <- LoggerNull$new()
logger$log_info("This message will not print!")
logger$log_to_db(message = "This message will not be written to the database!")
try(logger$log_warn("This is a warning!"))
try(logger$log_error("This is an error!"))
nrow() but also works on remote tables

Usage:
nrow(.data)

Arguments:
.data: (data.frame(1) or tbl_dbi(1)) The object to count the records of.

Value:
The number of records in the object.

Examples:
conn <- get_connection()
m <- dplyr::copy_to(conn, mtcars)
nrow(m) == nrow(mtcars) # TRUE
close_connection(conn)
Test if a schema exists in a given connection

Usage:
schema_exists(conn, schema)

Arguments:
conn: (DBIConnection(1)) Connection object.
schema: (character(1)) The schema to test for existence.

Value:
TRUE if the given schema is found on conn.

Examples:
conn <- get_connection()
schema_exists(conn, "test")
close_connection(conn)
Slices a data object based on a time / date

Usage:
slice_time(.data, slice_ts, from_ts = "from_ts", until_ts = "until_ts")

Arguments:
.data: (data.frame(1) or tbl_dbi(1)) The historical data to slice.
slice_ts: (POSIXct(1), Date(1), or character(1)) The time / date to slice on.
from_ts, until_ts: (character(1)) The names of the columns in .data specifying when each record is valid from and until.

Value:
An object of the same class as .data.

Examples:
conn <- get_connection()
m <- mtcars %>%
  dplyr::mutate(
    "from_ts" = dplyr::if_else(dplyr::row_number() > 10,
                               as.Date("2020-01-01"),
                               as.Date("2021-01-01")),
    "until_ts" = as.Date(NA))
dplyr::copy_to(conn, m, name = "mtcars", temporary = FALSE)
q <- dplyr::tbl(conn, id("mtcars", conn))
nrow(slice_time(q, "2020-01-01")) # 10
nrow(slice_time(q, "2021-01-01")) # nrow(mtcars)
close_connection(conn)
This function attempts to determine the existence of a given table.
If a character input is given, matching is done heuristically assuming a "schema.table" notation.
If no schema is implied in this case, the default schema is assumed.

Usage:
table_exists(conn, db_table)

## S3 method for class 'DBIConnection'
table_exists(conn, db_table)

Arguments:
conn: (DBIConnection(1)) Connection object.
db_table: (id-like object(1)) A table specification (coercible by id()).

Value:
TRUE if db_table can be parsed to a table found in conn.

Examples:
conn <- get_connection()
dplyr::copy_to(conn, mtcars, name = "mtcars", temporary = FALSE)
dplyr::copy_to(conn, iris, name = "iris")
table_exists(conn, "mtcars") # TRUE
table_exists(conn, "iris") # FALSE
table_exists(conn, "temp.iris") # TRUE
close_connection(conn)
This function is heavily inspired by the unexported dbplyr function unique_table_name().

Usage:
unique_table_name(scope = "SCDB")

Arguments:
scope: (character(1)) The scope (prefix) to generate the unique table name within.

Value:
A character string for a table name based on the given scope parameter.

Examples:
print(unique_table_name()) # SCDB_<10 alphanumerical letters>
print(unique_table_name()) # SCDB_<10 alphanumerical letters>
print(unique_table_name("test")) # test_<10 alphanumerical letters>
print(unique_table_name("test")) # test_<10 alphanumerical letters>
Convenience function to paste together multiple columns into one.

Usage:
unite.tbl_dbi(data, col, ..., sep = "_", remove = TRUE, na.rm = FALSE)

Arguments:
data: A data frame.
col: The name of the new column, as a string or symbol. This argument is passed by expression and supports quasiquotation (you can unquote strings and symbols).
...: <tidy-select> Columns to unite.
sep: Separator to use between values.
remove: If TRUE, remove the input columns from the output data frame.
na.rm: If TRUE, missing values will be removed prior to uniting each value.

Value:
A tbl_dbi with the specified columns united into a new column named according to col.

See also:
separate(), the complement.

Examples:
library(tidyr, warn.conflicts = FALSE)

df <- expand_grid(x = c("a", NA), y = c("b", NA))
unite(df, "z", x:y, remove = FALSE)

# To remove missing values:
unite(df, "z", x:y, na.rm = TRUE, remove = FALSE)

# Separate is almost the complement of unite
unite(df, "xy", x:y) %>%
  separate(xy, c("x", "y"))
# (but note `x` and `y` now contain "NA", not NA)
update_snapshot() makes it easy to create and update a historical data table on a remote (SQL) server.
The function takes the data (.data) as it looks at a given point in time (timestamp) and then updates (or creates) a remote table identified by db_table.
This update only stores the changes between the new data (.data) and the data currently stored on the remote.
This way, the data can be reconstructed as it looked at any point in time while taking up as little space as possible.

See vignette("basic-principles") for a further introduction to the function.

Usage:
update_snapshot(
  .data,
  conn,
  db_table,
  timestamp,
  filters = NULL,
  message = NULL,
  tic = Sys.time(),
  logger = NULL,
  enforce_chronological_order = TRUE,
  collapse_continuous_records = FALSE
)

Arguments:
.data: (data.frame(1) or tbl_dbi(1)) The data set to update the remote table with.
conn: (DBIConnection(1)) Connection object.
db_table: (id-like object(1)) A table specification (coercible by id()).
timestamp: (POSIXct(1), Date(1), or character(1)) The timestamp describing the data being processed (not the current time).
filters: (data.frame(1) or tbl_dbi(1)) A filter to apply to the update; only records matching the keys of filters are considered (see filter_keys()).
message: (character(1)) A message to add to the log entry.
tic: (POSIXct(1)) A timestamp when the update was started (defaults to Sys.time()).
logger: (Logger(1)) A configured logging object; if none is given, one is initialized with default arguments.
enforce_chronological_order: (logical(1)) Should the updates be enforced to be in chronological order?
collapse_continuous_records: (logical(1)) Should records that are continuous in time and identical in content be collapsed into a single record?
Details:
The most common use case is having consecutive snapshots of a dataset and wanting to store the changes between them.
If you have a special case where you want to insert data that is not consecutive, you can set enforce_chronological_order to FALSE. This will allow you to insert data that is earlier than the latest time stamp.

If you have several updates in a single day and use Date() rather than POSIXct() as your time stamps, you may end up with records where from_ts and until_ts are equal. These records are not normally accessible with get_table(), and you may want to prevent them using collapse_continuous_records = TRUE.
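A minimal sketch of these two arguments (the dates and table name are illustrative; conn and data are assumed to exist as in the examples below):

# Insert a snapshot dated earlier than the latest timestamp on the remote
update_snapshot(
  data,
  conn = conn,
  db_table = "test.mtcars",
  timestamp = as.Date("2020-01-01"),
  enforce_chronological_order = FALSE
)

# Several same-day updates with Date time stamps: collapse zero-duration records
update_snapshot(
  data,
  conn = conn,
  db_table = "test.mtcars",
  timestamp = as.Date("2020-01-02"),
  collapse_continuous_records = TRUE
)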
Value:
No return value, called for side effects.

See also:
filter_keys

Examples:
conn <- get_connection()
data <- dplyr::copy_to(conn, mtcars)

# Copy the first 3 records
update_snapshot(
  head(data, 3),
  conn = conn,
  db_table = "test.mtcars",
  timestamp = Sys.time()
)

# Update with the first 5 records
update_snapshot(
  head(data, 5),
  conn = conn,
  db_table = "test.mtcars",
  timestamp = Sys.time()
)

dplyr::tbl(conn, "test.mtcars")
close_connection(conn)