Workflow is a base R6 class representing a bioinformatic workflow run from a UMCCR workflow manager.
A workflow has:
- a directory path with all the raw output files (either on GDS, S3 or local filesystem)
- a subset of files that are of interest for ingestion
  - tibble with full path and basename columns
- a set of parsers that can parse and tidy those files
  - each parser takes a path and returns a tidy tibble
- a list of tidy tibbles (or a tibble with nested tibbles)
Public fields
path: Path to directory with raw workflow results (from GDS, S3, or local filesystem).
wname: Name of workflow (e.g. umccrise, sash).
filesystem: Filesystem of `path` (gds/s3/local).
regexes: Tibble with file `regex` and `function` to parse it.
Methods
Method list_files()
List all files under given path.
Usage
Wf$list_files(
path = self$path,
max_files = 1000,
ica_token = Sys.getenv("ICA_ACCESS_TOKEN"),
...
)
Method list_files_filter_relevant()
List dracarys files under given path.
Usage
Wf$list_files_filter_relevant(
path = self$path,
max_files = 1000,
ica_token = Sys.getenv("ICA_ACCESS_TOKEN"),
...
)
Method download_files()
Download files from GDS/S3 to local filesystem.
Usage
Wf$download_files(
path = self$path,
outdir,
ica_token = Sys.getenv("ICA_ACCESS_TOKEN"),
max_files = 1000,
dryrun = FALSE,
recursive = NULL
)
Arguments
path: Path with raw results.
outdir: Path to output directory.
ica_token: ICA access token (def: $ICA_ACCESS_TOKEN env var).
max_files: Max number of files to list.
dryrun: If TRUE, just list the files that will be downloaded (don't download them).
recursive: Should files be returned recursively in and under the specified GDS directory, or only directly in the specified GDS directory (def: TRUE via ICA API).
Method tidy_files()
Tidy given files.
Method write()
Write tidy data.
Examples
if (FALSE) { # \dontrun{
  # Each row pairs a file-name regex with the name of the function used to
  # parse files matching it; the tibble is handed to Wf$new() via `regexes`.
  regexes <- tibble::tribble(
    ~regex, ~fun,
    "-chord\\.tsv\\.gz$", "UmChordTsvFile",
    "-hrdetect\\.tsv\\.gz$", "UmHrdetectTsvFile",
    "-snv_2015\\.tsv\\.gz$", "UmSigsSnvFile",
    "-snv_2020\\.tsv\\.gz$", "UmSigsSnvFile",
    "-dbs\\.tsv\\.gz$", "UmSigsDbsFile",
    "-indel\\.tsv\\.gz$", "UmSigsIndelFile",
    "-qc_summary\\.tsv\\.gz$", "UmQcSumFile",
  )

  #---- LOCAL ----#
  p1_local <- "~/icav1/g/production/analysis_data"
  p <- file.path(p1_local, "SBJ01155/umccrise/202408300c218043/L2101566__L2101565")
  um1 <- Wf$new(path = p, wname = "umccrise", regexes = regexes)
  um1$list_files(max_files = 10)
  um1$list_files_filter_relevant(max_files = 10)

  #---- GDS ----#
  p1_gds <- "gds://production/analysis_data"
  p <- file.path(p1_gds, "SBJ03043/umccrise/20240830ec648f40/L2300064__L2300063")
  # Mirror the GDS path under a local directory for the downloaded files.
  outdir <- sub("gds:/", "~/icav1/g", p)
  token <- Sys.getenv("ICA_ACCESS_TOKEN")
  um2 <- Wf$new(path = p, wname = "umccrise", regexes = regexes)
  um2$list_files(max_files = 10)
  um2$list_files_filter_relevant(ica_token = token, max_files = 500)
  # dryrun = TRUE only lists the files that would be downloaded.
  d <- um2$download_files(
    outdir = outdir, ica_token = token,
    max_files = 1000, dryrun = TRUE
  )
  d_tidy <- um2$tidy_files(d)

  #---- S3 ----#
  p1_s3 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05570/sash/202408275fce06c3"
  p2_s3 <- "L2401304_L2401303/SBJ05570_MDX240299/cancer_report/cancer_report_tables"
  p <- file.path(p1_s3, p2_s3)
  # Mirror the S3 path under a local directory for the downloaded files.
  outdir <- sub("s3:/", "~/s3", p)
  um3 <- Wf$new(path = p, wname = "sash", regexes = regexes)
  um3$list_files(max_files = 10)
  um3$list_files_filter_relevant(max_files = 50)
  # download_files() has no `regexes` argument (they are supplied to Wf$new()),
  # so only the documented arguments are passed here.
  d <- um3$download_files(outdir = outdir, max_files = 50, dryrun = FALSE)
} # }