Workflow is a base R6 class representing a bioinformatic workflow run from a UMCCR workflow manager.
A workflow has:
a directory path with all the raw output files (either on GDS, S3 or local filesystem)
a subset of files that are of interest for ingestion
tibble with full path and basename columns
a set of parsers that can parse and tidy those files
each parser takes a path and returns a tidy tibble
a list of tidy tibbles (or a tibble with nested tibbles)
Public fields
path
Path to directory with raw workflow results (from GDS, S3, or local filesystem).
wname
Name of workflow (e.g. umccrise, sash).
filesystem
Filesystem of `path` (gds/s3/local).
regexes
Tibble with file `regex` and `fun`ction to parse it.
Methods
Method list_files()
List all files under given path.
Usage
Wf$list_files(
path = self$path,
max_files = 1000,
ica_token = Sys.getenv("ICA_ACCESS_TOKEN"),
...
)
Method list_files_filter_relevant()
List dracarys-relevant files under given path.
Usage
Wf$list_files_filter_relevant(
path = self$path,
max_files = 1000,
ica_token = Sys.getenv("ICA_ACCESS_TOKEN"),
...
)
Method download_files()
Download files from GDS/S3 to local filesystem.
Usage
Wf$download_files(
path = self$path,
outdir,
ica_token = Sys.getenv("ICA_ACCESS_TOKEN"),
max_files = 1000,
dryrun = FALSE,
recursive = NULL
)
Arguments
path
Path with raw results.
outdir
Path to output directory.
ica_token
ICA access token (def: $ICA_ACCESS_TOKEN env var).
max_files
Max number of files to list.
dryrun
If TRUE, just list the files that will be downloaded (don't download them).
recursive
Should files be returned recursively in and under the specified GDS directory, or only directly in the specified GDS directory (def: TRUE via ICA API).
Method tidy_files()
Tidy given files.
Method write()
Write tidy data.
Examples
if (FALSE) { # \dontrun{
  # Each row pairs a file-name regex with the name of the function used to
  # parse files matching it.
  regexes <- tibble::tribble(
    ~regex, ~fun,
    "-chord\\.tsv\\.gz$", "UmChordTsvFile",
    "-hrdetect\\.tsv\\.gz$", "UmHrdetectTsvFile",
    "-snv_2015\\.tsv\\.gz$", "UmSigsSnvFile",
    "-snv_2020\\.tsv\\.gz$", "UmSigsSnvFile",
    "-dbs\\.tsv\\.gz$", "UmSigsDbsFile",
    "-indel\\.tsv\\.gz$", "UmSigsIndelFile",
    "-qc_summary\\.tsv\\.gz$", "UmQcSumFile",
  )

  #---- LOCAL ----#
  p1_local <- "~/icav1/g/production/analysis_data"
  p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565")
  um1 <- Wf$new(path = p, wname = "umccrise", regexes = regexes)
  um1$list_files(max_files = 10)
  um1$list_files_filter_relevant(max_files = 10)

  #---- GDS ----#
  p1_gds <- "gds://production/analysis_data"
  p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063")
  # Mirror the GDS path under the local GDS mount point.
  outdir <- sub("gds:/", "~/icav1/g", p)
  token <- Sys.getenv("ICA_ACCESS_TOKEN")
  um2 <- Wf$new(path = p, wname = "umccrise", regexes = regexes)
  um2$list_files(max_files = 10)
  um2$list_files_filter_relevant(ica_token = token, max_files = 500)
  # dryrun = TRUE only lists the files that would be downloaded.
  d <- um2$download_files(
    outdir = outdir, ica_token = token,
    max_files = 1000, dryrun = TRUE
  )
  d_tidy <- um2$tidy_files(d)

  #---- S3 ----#
  p1_s3 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05570/sash/202408275fce06c3"
  p2_s3 <- "L2401304_L2401303/SBJ05570_MDX240299/cancer_report/cancer_report_tables"
  p <- file.path(p1_s3, p2_s3)
  # Mirror the S3 path under a local directory.
  outdir <- sub("s3:/", "~/s3", p)
  um3 <- Wf$new(path = p, wname = "sash", regexes = regexes)
  um3$list_files(max_files = 10)
  um3$list_files_filter_relevant(max_files = 50)
  # `regexes` is supplied to the constructor; download_files() has no such
  # argument, so it is not passed here.
  d <- um3$download_files(outdir = outdir, max_files = 50, dryrun = FALSE)
} # }