Skip to contents

Tidies UMCCR workflow results into a list of tibbles and writes individual tibbles to TSV, Parquet, SparkDF, or RDS format.

Usage

umccr_tidy(
  in_dir = NULL,
  out_dir = NULL,
  prefix = NULL,
  local_dir = NULL,
  out_format = "tsv",
  dryrun = FALSE,
  token = Sys.getenv("ICA_ACCESS_TOKEN"),
  pattern = NULL
)

Arguments

in_dir

Directory path to UMCCR workflow results (can be GDS, S3 or local).

out_dir

Output directory.

prefix

Prefix of output file(s).

local_dir

If in_dir is a GDS or S3 directory, 'recognisable' files will first be downloaded to this directory.

out_format

Format of output (tsv, parquet, both) (def: tsv).

dryrun

Just list the files that will be downloaded (def: FALSE).

token

ICA access token (by default uses $ICA_ACCESS_TOKEN env var).

pattern

Pattern to further filter the returned file type tibble (see name column in the DR_FILE_REGEX tibble).

Value

Tibble with path to input file and the resultant tidy object.

Examples

if (FALSE) {
# Example 1: remote input. The GDS variant is active; the S3 variant is kept
# as commented-out alternatives so that running the block top to bottom uses
# one consistent set of values (input path, output dir and prefix).
in_dir <- paste0(
  "gds://production/analysis_data/SBJ01639/tso_ctdna_tumor_only/",
  "202204045ad5743c/L2200214/Results/PRJ220425_L2200214"
)
# in_dir <- file.path(
#   "s3://umccr-primary-data-prod/UMCCR-Validation/SBJ00596",
#   "ctTSO/2021-03-17/PTC_SSqCMM05pc_L2100067"
# )

# Derive the output directory from the input path by dropping the scheme.
o1 <- sub("^gds://", "", in_dir)
out_dir <- file.path(fs::path_home(), "icav1/g", o1)
# S3 alternative:
# o1 <- sub("s3:/", "~/s3", in_dir)
# out_dir <- o1
# in_dir <- file.path(out_dir, "dracarys_gds_sync")
prefix <- "SBJ01639"
# prefix <- "PTC_SSqCMM05pc_L2100067" # prefix for the S3 alternative
out_format <- "rds"
umccr_tidy(
  in_dir = in_dir, out_dir = out_dir, prefix = prefix,
  out_format = out_format, dryrun = FALSE
)

# Example 2: local input directory, default output format.
in_dir <- here::here("nogit/tso/2022-12-13/SBJ02858/dracarys_gds_sync")
out_dir <- file.path(in_dir, "../out")
prefix <- "SBJ02858"
dryrun <- FALSE
umccr_tidy(in_dir = in_dir, out_dir = out_dir, prefix = prefix, dryrun = dryrun)
}