Build a column of data that maps baseline values to every row. Baseline values are starting points or reference points for each subject, e.g. the value of a measurement at time zero. The method will produce a mean value if multiple entries match the subjectID and visitID conditions.

fillBaselineColumn(
  mxdfin,
  columnName,
  subjectID,
  visitID,
  baselineVisitValue,
  baselineExt = "_BL",
  deltaExt = "_delta",
  fast = TRUE,
  verbose = TRUE
)

Arguments

mxdfin

Input data frame with repeated measurements and a grouped time variable

columnName

string or vector of strings defining valid columns in the data frame to process.

subjectID

the unique subject id column name

visitID

name of the column that identifies which rows are baseline visits, e.g. isBaseline

baselineVisitValue

the value defining baseline e.g. TRUE

baselineExt

string appended to column name defining the baseline variable

deltaExt

string appended to column name defining the change variable

fast

boolean; if TRUE, uses a faster, column-wise approach (requires data.table-like operations). Will only return subjects with baseline values. If there are several baseline entries, these will be averaged (for numeric) or the mode will be taken (for non-numeric). Only works robustly with atomic data types (numeric, character, factor).

verbose

boolean to print progress messages

Value

data frame with new columns (or a list containing it if deltaExt is NA)

Author

Avants BB

Examples

# Generate dummy data for an example
# Build a small synthetic longitudinal data set for the examples below.
#
# Arguments:
#   n_subjects  number of subjects; IDs are "Sub_1", "Sub_2", ...
#   n_visits    number of visits per subject; visit times are
#               0, 1, ..., n_visits - 1, with 0 used as the baseline visit.
#
# Returns a data.frame with columns commonID, yearsbl,
# RandomBasisProjection01 (numeric, with up to 5 values set to NA) and
# CategoricalVar ("A"/"B"/"C"), plus a copy of the first subject's
# yearsbl == 0 row re-labelled with yearsbl = 0.001.
#
# Note: set.seed(123) is called on every invocation, so the output is
# reproducible, but the caller's global RNG state is reset as a side effect.
generateSubtyperData <- function(n_subjects = 10, n_visits = 5) {
  set.seed(123)
  # seq_len() stays empty for a size of zero, whereas 1:0 yields c(1, 0).
  subjects <- paste0("Sub_", seq_len(n_subjects))
  n_rows <- n_subjects * n_visits
  df <- data.frame(
    commonID = rep(subjects, each = n_visits),
    yearsbl = rep(seq_len(n_visits) - 1L, n_subjects), # 0 is the baseline
    RandomBasisProjection01 = rnorm(n_rows, mean = 10, sd = 2),
    CategoricalVar = sample(c("A", "B", "C"), n_rows, replace = TRUE),
    stringsAsFactors = FALSE
  )

  # Blank out some measurements so that 'fast' mode filtering can be tested.
  # The count is capped at the number of rows so tiny data sets do not fail.
  n_missing <- min(5L, nrow(df))
  if (n_missing > 0) {
    df$RandomBasisProjection01[sample.int(nrow(df), n_missing)] <- NA
  }

  # Append a copy of the first subject's baseline row at a slightly shifted
  # visit time (0.001). Skipped when there is no such row to copy.
  extra_baseline_subject <- df[
    df$commonID == subjects[1] & df$yearsbl == 0, ,
    drop = FALSE
  ]
  if (nrow(extra_baseline_subject) > 0) {
    extra_baseline_subject$yearsbl <- 0.001
    df <- rbind(df, extra_baseline_subject)
  }

  df
}

# Example 1: one numeric column processed via the fast path.
# The result of the call is indexed with [[1]] to extract its first element.
mydf <- generateSubtyperData(n_subjects = 10)
mydog <- fillBaselineColumn(
  mxdfin = mydf,
  columnName = "RandomBasisProjection01",
  subjectID = "commonID",
  visitID = "yearsbl",
  baselineVisitValue = 0, # rows with yearsbl == 0 are the baseline rows
  fast = TRUE,
  verbose = TRUE
)[[1]]
#> Using fast path...
#> Filtering mxdfin to subjects with baseline data.
#> Replicating baseline rows...
#> Filling columns...
#> Fill done.
# Note on multiple baselines: generateSubtyperData() appends a copy of Sub_1's
# baseline row with yearsbl = 0.001, which does not match baselineVisitValue = 0,
# so Sub_1 still has only one matching baseline row in this example.
# For the aggregation step ('aggregate') to take effect, the example data would
# need manual adjustment, for instance giving Sub_1 two 'yearsbl' == 0 entries
# with different RandomBasisProjection01 values.

# Example 2: several columns at once (one numeric, one categorical), fast path.
# Progress messages are switched off with verbose = FALSE.
mydf2 <- generateSubtyperData(n_subjects = 10)
mydog2 <- fillBaselineColumn(
  mxdfin = mydf2,
  # one numeric and one character column from the generated data
  columnName = c("RandomBasisProjection01", "CategoricalVar"),
  subjectID = "commonID",
  visitID = "yearsbl",
  baselineVisitValue = 0,
  fast = TRUE,
  verbose = FALSE
)[[1]]
head(mydog2)
#>    commonID yearsbl RandomBasisProjection01 CategoricalVar
#> 1     Sub_1   0.000                8.879049              A
#> 2     Sub_1   1.000                9.539645              B
#> 3     Sub_1   2.000               13.117417              A
#> 4     Sub_1   3.000               10.141017              B
#> 5     Sub_1   4.000               10.258575              A
#> 51    Sub_1   0.001                8.879049              A
#>    RandomBasisProjection01_BL CategoricalVar_BL RandomBasisProjection01_delta
#> 1                    8.879049                 A                     0.0000000
#> 2                    8.879049                 A                     0.6605963
#> 3                    8.879049                 A                     4.2383679
#> 4                    8.879049                 A                     1.2619681
#> 5                    8.879049                 A                     1.3795268
#> 51                   8.879049                 A                     0.0000000
#>    CategoricalVar_delta
#> 1                    NA
#> 2                    NA
#> 3                    NA
#> 4                    NA
#> 5                    NA
#> 51                   NA

# Example 3: Fast mode, specific scenario if a subject entirely lacks baseline
# mydf3 <- generateSubtyperData(5)
# mydf3 <- mydf3[!(mydf3$commonID == "Sub_1" & mydf3$yearsbl == 0), ] # Remove Sub_1's baseline
# mydog3 <- fillBaselineColumn(
#   mxdfin = mydf3,
#   columnName = "RandomBasisProjection01",
#   subjectID = 'commonID',
#   visitID = 'yearsbl',
#   baselineVisitValue = 0,
#   fast = TRUE,
#   verbose = TRUE
# )[[1]]
# print(unique(mydog3$commonID)) # Sub_1 should be absent

# Example 4: slow path, selected explicitly with fast = FALSE
# (the documented default is fast = TRUE).
mydf_slow <- generateSubtyperData(n_subjects = 5)
mydog_slow <- fillBaselineColumn(
  mxdfin = mydf_slow,
  columnName = "RandomBasisProjection01",
  subjectID = "commonID",
  visitID = "yearsbl",
  baselineVisitValue = 0,
  fast = FALSE, # opt out of the fast path
  verbose = TRUE
)[[1]]
#> Using slow path...
#> 0%. 100%.
head(mydog_slow)
#>   commonID yearsbl RandomBasisProjection01 CategoricalVar
#> 1    Sub_1       0                8.879049              A
#> 2    Sub_1       1                9.539645              C
#> 3    Sub_1       2               13.117417              A
#> 4    Sub_1       3               10.141017              C
#> 5    Sub_1       4               10.258575              B
#> 6    Sub_2       0               13.430130              A
#>   RandomBasisProjection01_BL RandomBasisProjection01_delta
#> 1                   8.879049                     0.0000000
#> 2                   8.879049                     0.6605963
#> 3                   8.879049                     4.2383679
#> 4                   8.879049                     1.2619681
#> 5                   8.879049                     1.3795268
#> 6                  13.430130                     0.0000000