In this example, we detour and learn a few SQL techniques. SQL is expressive and declarative.
-- you can always select 1 integer value regardless
select 1;
-- and perform an arithmetic operation
select 1 + 1;
-- and select some string
select 'some string';
-- this comes in handy when you want to create an inline label column
select
(select 'some string') as MY_LABEL_COLUMN,
*
from lims
limit 10;
-- and transform some date-formatted string into a date, called "type casting"
select cast('2025-02-01' as date);
-- and combine two select results
select 1
union
select 2;
-- and compare (diff) two select results
select 1
except
select 1;
-- return 0 as no differences
To put into use, for example, the requested longitudinal re-analysis cases.
-- find me LIMS records for given SampleID and label them
select
(select 'Patient 1') as patient_label,
*
from lims
where
sample_id in ('MDX190012', 'MDX190003', 'MDX220074', 'MDX250208')
union all
select
(select 'Patient 2') as patient_label,
*
from lims
where
sample_id in ('MDX200051', 'MDX200052', 'MDX250209');
You can leverage database function where supported. There are plenty of functions supported by AWS Athena as well as PostgreSQL.
-- select some string into all uppercase
select upper('some string');
-- select the year component of a date
select year(cast('2025-02-01' as date));
https://neon.tech/postgresql/postgresql-tutorial/postgresql-subquery
-- you can stack up a query within a query, hence, subquery
select
*
from workflow
where
library_id in (
select distinct library_id from lims where internal_subject_id in ('SBJ00057', 'SBJ00306')
)
and workflow_status = 'SUCCEEDED';
https://neon.tech/postgresql/postgresql-tutorial/postgresql-joins
-- or join the two tables on the candidate column(s) that have them merged
select
*
from workflow
join lims on lims.library_id = workflow.library_id
where
workflow.workflow_status = 'SUCCEEDED'
and lims.internal_subject_id in ('SBJ00057', 'SBJ00306');
Recall Venn diagram and Set theory :). And the cheat sheet.
https://neon.tech/postgresql/postgresql-tutorial/postgresql-cte
-- how to remember something so that you can reference it in the next step
with cte1 as (
select 100 as MY_VALUE
)
select * from cte1;
-- you can chain the CTE blocks
with cte1 as (
select 100 as MY_VALUE
),
cte2 as (
select MY_VALUE * 2 as MY_VALUE_2X from cte1
)
select * from cte2;
-- CTE can be syntactic sugar
with requested as (
select library_id from lims where internal_subject_id in ('SBJ00057', 'SBJ00306')
),
succeeded as (
select * from workflow where library_id in (
select distinct library_id from requested
) and workflow_status = 'SUCCEEDED'
)
select * from succeeded;
-- CTE can be put into powerful use
with cte1 as (
select
'ICAv1' as platform_version,
avg(workflow_duration) as tn_avg_runtime_seconds
from workflow
where
workflow_status = 'SUCCEEDED'
and workflow_name = 'wgs_tumor_normal'
),
cte2 as (
select
'ICAv2' as platform_version,
avg(workflow_duration) as tn_avg_runtime_seconds
from workflow
where
workflow_status = 'SUCCEEDED'
and workflow_name = 'tumor-normal'
and workflow_start > cast('2025-02-01' as date) -- trim outlier
)
select * from cte1
union
select * from cte2;