library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.0 ✓ dplyr 1.0.4
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(brms)
## Loading required package: Rcpp
## Loading 'brms' package (version 2.14.4). Useful instructions
## can be found by typing help('brms'). A more detailed introduction
## to the package is available through vignette('brms_overview').
##
## Attaching package: 'brms'
## The following object is masked from 'package:stats':
##
## ar
library(bmlm)
library(bayesplot)
## This is bayesplot version 1.8.1
## - Online documentation and vignettes at mc-stan.org/bayesplot
## - bayesplot theme set to bayesplot::theme_default()
## * Does _not_ affect other ggplot2 plots
## * See ?bayesplot_theme_set for details on theme setting
library(tidybayes)
##
## Attaching package: 'tidybayes'
## The following objects are masked from 'package:brms':
##
## dstudent_t, pstudent_t, qstudent_t, rstudent_t
We are going to work again with the PANDA dataset that you have now seen for a few weeks. However, in this challenge we are going to be combining cleaning, reformatting, and modeling from the past 4-weeks, so this should be a challenge!
Load the data from the SIPPS website: https://columbia-sipps.github.io/workshops/coding_advanced/. As a reminder, these data include a variable for ID and gender, and then assess self-worth (SPPA_SWORTH), communication quality with mothers (MPACS), and generalized anxiety (SCARED_GAD) every six months at 6 different time-points.
d <- read.csv('https://columbia-sipps.github.io/workshop_files/coding-adv-04/mlm-data-cleaning.csv')
As opposed to the week when we explored whether/how self-worth changed over time, today we are going to look at the association between communication quality with mothers and self-worth. Specifically, our question is at timepoints when adolescents reported better communication quality with their mothers (as compared to their own average communication quality across the 6 timepoints) did they concurrently report better/higher self-worth? Remember, that this is a within-person (as oppossed to a between-person) research question.
Keep ID, MPCAS variables and SWORTH variables in the dataset hint use the contains function for MPACS and SWORTH
d <- d %>%
select(ID, contains ("MPACS"), contains ("SWORTH"))
Make the variable names easier to understand
d <- d %>%
rename(
id = 1,
communication_1 = 2,
communication_2 = 3,
communication_3 = 4,
communication_4 = 5,
communication_5 = 6,
communication_6 = 7,
sworth_1 = 8,
sworth_2 = 9,
sworth_3 = 10,
sworth_4 = 11,
sworth_5 = 12,
sworth_6 = 13,
)
Here, you need to do this separately for self-worth and communication. That is, use the pivot_longer function like we did in the past for time and self-worth, and then create another long dataset for communication. hint: include include a command for names_prefix to make the time labels the same (and numeric) across the two dataframes you are creating
You will then need to merge the two dataframes using full_join by both id and time
#self-worth long dataset
d_long_sw <- d %>%
tidyr :: pivot_longer(., cols = contains("sworth"), names_to = "time",
names_prefix = "sworth_", values_to = "self_worth")
#communication long dataset
d_long_comm <- d %>%
tidyr :: pivot_longer(., cols = contains("comm"), names_to = "time",
names_prefix = "communication_", values_to = "communication")
#merge the two dataframes (using full_join)
#use select to keep the variables we want (id, time, self-worth and communication)
d_long <- full_join(d_long_sw, d_long_comm, by = c("id", "time")) %>%
select(id, time, self_worth, communication)
- filter out the NA responses for self_worth (use !is.na)
- filter out the NA responses for communication
d_long <- d_long %>%
dplyr::filter(!is.na(self_worth)) %>%
dplyr::filter(!is.na(communication))
Make it so that timepoint 1 is recoded tp be 0, so that time ranges from 0 to 5.
d_long$time <- as.numeric(d_long$time)
d_long$time <- d_long$time - 1
Self worth was previously on a 1 to 4 scale Communication was on a 20 to 100 scale
There are numerous ways to do this, but one way is to implement this equation: (max new - min new)/(max old - min old) * (variable - max old) + max new
d_long$self_worth <- 10/4 * (d_long$self_worth - 4) + 10
d_long$communication <- 10/100 * (d_long$communication - 100) + 10
hint, use the isolate function
d_long <- isolate(d_long, by = "id", value = "communication", which = "both")
Self-worth should be predicted by time and within-subject communication quality. There should be a random intercept, as well as a random slope for within0subject communication quality.
m1 <- brm(self_worth ~ time + communication_cw + (1 + communication_cw | id), data = d_long)
summary(m1)
- Time: self-worth for the typical adolescent decreases slightly over time. Specifically, there is a .13 unit decrease in self-worth (on a 10-point scale) every 6-months. This is statistically significant.
- Communication: At timepoints when adolescents report higher communication quality with their mothers, as compared to their average communication quality across the study period, there is an associated .40 unit decrease in self-worth (on a 10-point scale). This is also statistically significant.
hint, when quantifying the heterogeneity interval use the formula: fixed effect +/- (plus or minus) 2 times sd of random effects >- There is significant heterogeneity (variability) in the association between within-person communication quality and self-worth.95% of the slopes fall between the range of -.34 and 1.14 (.40 +/- (2* .37) meaning that for some people, they report WORSE self-worth at times when they report better communication with their mothers, and for other people, they experience a stronger positive association between person-specific increases in communication quality and self-worth than the average participant.
Plot a spaghetti plot
#pull out the random effects
ranefs <- ranef(m1)
#make it into a dataframe
ranefs <- data.frame(as.numeric(rownames(ranefs$id)),ranefs$id)
ggplot(d_long, aes(communication_cw, self_worth)) +
#pulling in person specific intercepts and slopes
geom_abline(ranefs,
intercept = ranefs$Estimate.Intercept + 8.1807947,
slope = ranefs$Estimate.communication_cw + 0.3985144,
color="black", lwd = .3, alpha = .3) +
#pulling in fixed effects (average line)
geom_abline(ranefs, intercept = 8.1807947, slope = 0.3985144,
color="red", lwd = 1.5, alpha = 1) +
xlim(-4, 5) + ylim(0, 10) +
theme(text=element_text(size=16))
## Warning: geom_abline(): Ignoring `mapping` because `slope` and/or `intercept`
## were provided.
## Warning: geom_abline(): Ignoring `mapping` because `slope` and/or `intercept`
## were provided.
Make sure to visualize where the fixed effect is as well.
#add the random effects of each person to the fixed effect to get person specific slopes
ranefs$ind_slopes <- ranefs$Estimate.communication_cw + 0.3985144
ggplot(ranefs, aes(x = ind_slopes)) + geom_histogram(color = "blue", fill = "blue",
alpha = .4, bins = 40) +
#fixed effect line
geom_vline(xintercept = 0.3985144, size = 1, color = "black", linetype="dashed") +
#create the 95% credibility interval (fixed effect +/- 1.96 * sd of random effect)
geom_vline(xintercept = 0.3985144 - 1.96*.37, size = .5, color = "red") +
geom_vline(xintercept = 0.3985144 + 1.96*.37, size = .5, color = "red") +
theme_bw() + theme(text = element_text(size=12))