fill baseline column — fillBaselineColumn • subtyper

Build a column of data that maps baseline values to every row. Baseline values are starting points or reference points for each subject, e.g. the value of a measurement at time zero. The method will produce a mean value if multiple entries match the subjectID and visitID conditions.

fillBaselineColumn(
  mxdfin,
  columnName,
  subjectID,
  visitID,
  baselineVisitValue,
  baselineExt = "_BL",
  deltaExt = "_delta",
  fast = TRUE,
  verbose = TRUE
)

Arguments

mxdfin: Input data frame with repeated measurements and a grouped time variable
columnName: string or vector of strings defining valid columns in the data frame to process.
subjectID: the unique subject id column name
visitID: name of the column that identifies the baseline visit (i.e. encodes baseline-ness), e.g. isBaseline
baselineVisitValue: the value defining baseline e.g. TRUE
baselineExt: string appended to column name defining the baseline variable
deltaExt: string appended to column name defining the change variable
fast: boolean; if TRUE, uses a faster, column-wise approach (requires data.table-like operations). Will only return subjects with baseline values. If there are several baseline entries, these will be averaged (for numeric) or the mode will be taken (for non-numeric). Only works robustly with atomic data types (numeric, character, factor).
verbose: boolean to print progress messages

Value

data frame with new columns (or a list containing it if deltaExt is NA)

Author

Avants BB

Examples

# Generate dummy data for an example
# Build a small, reproducible long-format data set for the examples below.
#
# Arguments:
#   n_subjects: number of distinct subjects (IDs "Sub_1", "Sub_2", ...); >= 1.
#   n_visits:   number of visits per subject, coded 0, 1, ..., n_visits - 1
#               in `yearsbl`; >= 1.
# Returns:
#   A data frame with columns commonID, yearsbl, RandomBasisProjection01 and
#   CategoricalVar, holding n_subjects * n_visits rows plus one extra row for
#   the first subject.
# Note: calls set.seed(123), so the caller's RNG state is reset on every call.
generateSubtyperData <- function(n_subjects = 10, n_visits = 5) {
  # Fail early with a clear message instead of a confusing data.frame() error.
  stopifnot(
    is.numeric(n_subjects), length(n_subjects) == 1L, n_subjects >= 1,
    is.numeric(n_visits), length(n_visits) == 1L, n_visits >= 1
  )
  set.seed(123)
  subjects <- paste0("Sub_", seq_len(n_subjects))
  n_rows <- n_subjects * n_visits
  df <- data.frame(
    commonID = rep(subjects, each = n_visits),
    yearsbl = rep(seq_len(n_visits) - 1L, n_subjects), # 0 is the baseline visit
    RandomBasisProjection01 = rnorm(n_rows, mean = 10, sd = 2),
    CategoricalVar = sample(c("A", "B", "C"), n_rows, replace = TRUE),
    stringsAsFactors = FALSE
  )
  # Blank out up to 5 random measurements so the examples contain missing data.
  # The count is capped at the number of rows so tiny data sets do not make
  # sample.int() fail with "cannot take a sample larger than the population".
  na_rows <- sample.int(nrow(df), min(5L, nrow(df)))
  df$RandomBasisProjection01[na_rows] <- NA
  # Append a copy of the first subject's baseline row with yearsbl = 0.001.
  # The copy keeps the same measurement values; because 0.001 != 0 it is a
  # near-baseline visit rather than an exact second yearsbl == 0 entry.
  extra_baseline_subject <- df[df$commonID == subjects[1] & df$yearsbl == 0, ]
  extra_baseline_subject$yearsbl <- 0.001
  df <- rbind(df, extra_baseline_subject)

  df
}

# Example 1: Single numeric column with fast mode
# Build the dummy data with 10 subjects (n_visits keeps its default of 5).
mydf <- generateSubtyperData(10)
# deltaExt and baselineExt are left at their defaults ("_delta", "_BL").
# [[1]] takes the first element of the returned value.
mydog <- fillBaselineColumn(
  mxdfin = mydf,
  columnName = "RandomBasisProjection01",
  subjectID = 'commonID',
  visitID = 'yearsbl',
  baselineVisitValue = 0,
  fast = TRUE,
  verbose = TRUE
)[[1]]
#> Using fast path...
#> Filtering mxdfin to subjects with baseline data.
#> Replicating baseline rows...
#> Filling columns...
#> Fill done.
# Check a subject with multiple baselines (Sub_1).
# Note: generateSubtyperData() gives Sub_1's extra row yearsbl = 0.001, not 0,
# and copies the original baseline values unchanged. To realistically simulate
# multiple baselines so that 'aggregate' takes effect, adjust the data manually,
# for instance so that Sub_1 has two 'yearsbl' == 0 entries with different
# RandomBasisProjection01 values.

# Example 2: Multiple columns (numeric and categorical) with fast mode
mydf2 <- generateSubtyperData(10)
mydog2 <- fillBaselineColumn(
  mxdfin = mydf2,
  columnName = c("RandomBasisProjection01", "CategoricalVar"), # one numeric and one character column
  subjectID = 'commonID',
  visitID = 'yearsbl',
  baselineVisitValue = 0,
  fast = TRUE,
  verbose = FALSE
)[[1]]
# Show the first rows; the printed output has a _BL and a _delta column for
# each requested column.
head(mydog2)
#>    commonID yearsbl RandomBasisProjection01 CategoricalVar
#> 1     Sub_1   0.000                8.879049              A
#> 2     Sub_1   1.000                9.539645              B
#> 3     Sub_1   2.000               13.117417              A
#> 4     Sub_1   3.000               10.141017              B
#> 5     Sub_1   4.000               10.258575              A
#> 51    Sub_1   0.001                8.879049              A
#>    RandomBasisProjection01_BL CategoricalVar_BL RandomBasisProjection01_delta
#> 1                    8.879049                 A                     0.0000000
#> 2                    8.879049                 A                     0.6605963
#> 3                    8.879049                 A                     4.2383679
#> 4                    8.879049                 A                     1.2619681
#> 5                    8.879049                 A                     1.3795268
#> 51                   8.879049                 A                     0.0000000
#>    CategoricalVar_delta
#> 1                    NA
#> 2                    NA
#> 3                    NA
#> 4                    NA
#> 5                    NA
#> 51                   NA

# Example 3: Fast mode, specific scenario if a subject entirely lacks baseline
# mydf3 <- generateSubtyperData(5)
# mydf3 <- mydf3[!(mydf3$commonID == "Sub_1" & mydf3$yearsbl == 0), ] # Remove Sub_1's baseline
# mydog3 <- fillBaselineColumn(
#   mxdfin = mydf3,
#   columnName = "RandomBasisProjection01",
#   subjectID = 'commonID',
#   visitID = 'yearsbl',
#   baselineVisitValue = 0,
#   fast = TRUE,
#   verbose = TRUE
# )[[1]]
# print(unique(mydog3$commonID)) # Sub_1 should be absent

# Example 4: Slow mode (fast = FALSE; note that the default is fast = TRUE)
mydf_slow <- generateSubtyperData(5)
mydog_slow <- fillBaselineColumn(
  mxdfin = mydf_slow,
  columnName = "RandomBasisProjection01",
  subjectID = 'commonID',
  visitID = 'yearsbl',
  baselineVisitValue = 0,
  fast = FALSE, # Explicitly use the slow path
  verbose = TRUE
)[[1]]
#> Using slow path...
#> 0%. 100%.
head(mydog_slow)
#>   commonID yearsbl RandomBasisProjection01 CategoricalVar
#> 1    Sub_1       0                8.879049              A
#> 2    Sub_1       1                9.539645              C
#> 3    Sub_1       2               13.117417              A
#> 4    Sub_1       3               10.141017              C
#> 5    Sub_1       4               10.258575              B
#> 6    Sub_2       0               13.430130              A
#>   RandomBasisProjection01_BL RandomBasisProjection01_delta
#> 1                   8.879049                     0.0000000
#> 2                   8.879049                     0.6605963
#> 3                   8.879049                     4.2383679
#> 4                   8.879049                     1.2619681
#> 5                   8.879049                     1.3795268
#> 6                  13.430130                     0.0000000