# Part of Study 1: Semantic priming

# Preparation of final data set by merging several data sets from previous studies.
# All primary data sets were last downloaded on 3 January 2022.

# install.packages('httr')         # Data download
# install.packages('readxl')       # Data reading
# install.packages('readr')        # Data reading
# install.packages('dplyr')        # Data wrangling
# install.packages('purrr')        # Data wrangling
# install.packages('Matrix')       # Data wrangling
# install.packages('LSAfun')       # Computation of cosine similarity (not using the LSA space but one described below)
# install.packages('standardize')  # Used to cluster-mean center within-participants variables

library(httr)
library(readxl)
library(readr)
library(dplyr)
library(purrr)
library(Matrix)
library(LSAfun)
library(standardize)


# DATA SET 1. Lexical decision task from the Semantic Priming Project (Hutchison
# et al., 2013; https://doi.org/10.3758/s13428-012-0304-z), downloaded from
# https://www.montana.edu/attmemlab/documents/all%20ldt%20subs_all%20trials3.xlsx

# Note on reproducibility
# The code below was used to download and save the data set in the folder 'semanticpriming/data/primary_datasets'.
# To prevent any influence from future changes to the original data set online, the code below was protected (i.e.,
# commented out) by inserting the snippet `# Protected code # ` at the beginning of each line. If desired, the code
# can be run by removing the protective snippet from every line.

# Protected code # GET('https://www.montana.edu/attmemlab/documents/all%20ldt%20subs_all%20trials3.xlsx',
# Protected code #     write_disk(tf <- tempfile(fileext = ".xlsx")))
# Protected code # read_excel(tf) %>%
# Protected code #   write.csv('semanticpriming/data/primary_datasets/Hutchison_etal_2013_semanticpriming_lexicaldecision.csv',
# Protected code #             row.names = FALSE)

# Read in data set
semanticpriming =
  read.csv('semanticpriming/data/primary_datasets/Hutchison_etal_2013_semanticpriming_lexicaldecision.csv')

# Rename columns
semanticpriming = semanticpriming %>%
  rename(Participant = Subject, target_word = target,
         prime_word = prime, interstimulus_interval = isi)

# Only keep correctly-responded trials
semanticpriming = semanticpriming[semanticpriming$target.ACC == 1,]

# Remove responses faster than 200 ms or slower than 3,000 ms
# (Hutchison et al., 2013; https://doi.org/10.3758/s13428-012-0304-z)
semanticpriming = semanticpriming[semanticpriming$target.RT >= 200 &
                                    semanticpriming$target.RT <= 3000,]

# Calculate number of letters per word
semanticpriming$target_length = nchar(semanticpriming$target_word)

# Recode dichotomous predictor 'interstimulus_interval'
# (Brauer & Curtin, 2018; https://doi.org/10.1037/met0000159)
semanticpriming$recoded_interstimulus_interval =
  ifelse(semanticpriming$interstimulus_interval == 50, -0.5,
         ifelse(semanticpriming$interstimulus_interval == 1050, 0.5, 0))

# Process lingering NA values in recoded_interstimulus_interval
semanticpriming[is.na(semanticpriming$recoded_interstimulus_interval),
                'recoded_interstimulus_interval'] = 0

# View
summary(semanticpriming$recoded_interstimulus_interval)
table(semanticpriming$recoded_interstimulus_interval)
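# As an optional, read-only sanity check on the deviation coding above (not
# part of the original pipeline), the recoded values can be cross-tabulated
# against the original conditions: each interstimulus interval (50 or 1,050 ms)
# should map onto exactly one recoded value.
xtabs(~ interstimulus_interval + recoded_interstimulus_interval,
      data = semanticpriming)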
# Keep first-associate and other-associate trials only, thus removing nonword trials (Yap et al., 2017;
# https://www.montana.edu/attmemlab/documents/YHT_Individual_Differences_Priming.pdf)
semanticpriming =
  semanticpriming[semanticpriming$type == 'first' |
                    semanticpriming$type == 'other',] %>%

  # Join prime and target words by an underscore and save this into a new column,
  # which will be used as a grouping factor in the mixed-effects model. This
  # is necessary because every target word was preceded by four prime words
  # over different trials.
  mutate(primeword_targetword = paste0(prime_word, '_', target_word)) %>%

  # Select columns to keep
  select(Participant, prime_word, target_word, primeword_targetword,
         Session, Block, Trial, interstimulus_interval,
         recoded_interstimulus_interval, target_length, target.RT)

str(semanticpriming)
head(semanticpriming)


# DATA SET 2. Participants' characteristics, downloaded from the Semantic Priming Project:
# https://www.montana.edu/attmemlab/documents/LDT%20subject%20database.xlsx

# Note on reproducibility
# The code below was used to download and save the data set in the folder 'semanticpriming/data/primary_datasets'.
# To prevent any influence from future changes to the original data set online, the code below was protected (i.e.,
# commented out) by inserting the snippet `# Protected code # ` at the beginning of each line. If desired, the code
# can be run by removing the protective snippet from every line.

# Protected code # GET('https://www.montana.edu/attmemlab/documents/LDT%20subject%20database.xlsx',
# Protected code #     write_disk(tf <- tempfile(fileext = ".xlsx")))
# Protected code # read_excel(tf) %>%
# Protected code #   write.csv('semanticpriming/data/primary_datasets/Yap_etal_2017_individual.csv',
# Protected code #             row.names = FALSE)

# Read in data set
Yap_etal_2017_individual =
  read.csv('semanticpriming/data/primary_datasets/Yap_etal_2017_individual.csv')

# Calculate a single vocabulary score by averaging across the synonym, antonym and analogy tests
# (as in Yap et al., 2017; https://www.montana.edu/khutchison/documents/YHT%20in%20press.pdf)
Yap_etal_2017_individual$vocabulary_size =
  (Yap_etal_2017_individual$vocaba + Yap_etal_2017_individual$vocabb +
     Yap_etal_2017_individual$vocabc) / 3

# Rename columns
Yap_etal_2017_individual = Yap_etal_2017_individual %>%
  rename(Participant = SUBJECT, participant_gender = gender,
         attentional_control = ac)

# Merge general data set and participants' characteristics
semanticpriming =
  merge(semanticpriming,
        Yap_etal_2017_individual[, c('Participant', 'participant_gender',
                                     'vocabulary_size', 'attentional_control')],
        by = 'Participant')

# Participant gender data
semanticpriming %>%
  group_by(participant_gender) %>%
  tally(n_distinct(Participant))

# ^ Result
#   participant_gender     n
#
#   f                    241
#   F                     57
#   m                    162
#   M                     49
#   wf                     1
#   NA                     2
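# Note that merge() performs an inner join by default, so trials from
# participants without a row in the individual-differences file are dropped at
# this point. An optional, read-only check of how many participants remain
# (not part of the original pipeline):
n_distinct(semanticpriming$Participant)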
# Recode dichotomous predictor 'participant_gender' (Brauer & Curtin, 2018;
# https://doi.org/10.1037/met0000159). Male = -0.5, female = 0.5, others = 0
# (N.B. the current data, shown above, do not contain sufficient information
# to allow a more specific coding).
semanticpriming$recoded_participant_gender =
  ifelse(semanticpriming$participant_gender == 'M' |
           semanticpriming$participant_gender == 'm', -0.5,
         ifelse(semanticpriming$participant_gender == 'F' |
                  semanticpriming$participant_gender == 'f' |
                  semanticpriming$participant_gender == 'wf', 0.5, 0))

# Process lingering NA values in recoded_participant_gender
semanticpriming[is.na(semanticpriming$recoded_participant_gender),
                'recoded_participant_gender'] = 0

# View
summary(semanticpriming$recoded_participant_gender)
semanticpriming %>%
  group_by(recoded_participant_gender) %>%
  tally(n_distinct(Participant))

# Free up workspace
# rm(Yap_etal_2017_individual)


# DATA SET 3. Lexical measures from the English Lexicon Project (Balota et al., 2007;
# https://doi.org/10.3758/BF03193014), namely, number of syllables, orthographic Levenshtein
# distance (Yarkoni et al., 2008; https://doi.org/10.3758/PBR.15.5.971), and phonological
# Levenshtein distance (Yap & Balota, 2009; https://doi.org/10.1016/j.jml.2009.02.001).
# The latter two measures were created by the authors cited and added to the English Lexicon Project.

# First, the target words were saved into a CSV file as follows:
write.csv(sort(unique(semanticpriming$target_word)),
          'semanticpriming/data/primary_datasets/semanticpriming_targetwords.csv',
          row.names = FALSE)

# Next, the above file was uploaded to https://elexicon.wustl.edu/query14/query14.html,
# where the 'Method of Submission' selected was 'Filename Containing List of Words'.
# The default output variables 'Length' and 'Log_Freq_HAL' were deselected, whereas the
# output variables 'LgSUBTLWF', 'OLD', 'PLD' and 'NSyll' were selected. The query was
# executed and the resulting table was copied into the CSV file that is read in below.

Balota_etal_2007_ELP_lexical =
  read.csv('semanticpriming/data/primary_datasets/Balota_etal_2007_ELP_lexical.csv')

# Rename columns
Balota_etal_2007_ELP_lexical = Balota_etal_2007_ELP_lexical %>%
  rename(target_word = Word,
         target_word_frequency = LgSUBTLWF,
         target_orthographic_Levenshtein_distance = OLD,
         target_phonological_Levenshtein_distance = PLD,
         target_number_syllables = NSyll)

# Merge lexical measures into the main data set
semanticpriming =
  merge(semanticpriming,
        Balota_etal_2007_ELP_lexical[, c('target_word', 'target_word_frequency',
                                         'target_orthographic_Levenshtein_distance',
                                         'target_phonological_Levenshtein_distance',
                                         'target_number_syllables')],
        by = 'target_word')

# Free up workspace
# rm(Balota_etal_2007_ELP_lexical)
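# An optional, read-only coverage check (not part of the original pipeline):
# because the merge above is also an inner join, any target word absent from
# the ELP output silently drops out of the data set. The number of unique
# target words remaining can be inspected directly:
n_distinct(semanticpriming$target_word)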
# DATA SET 4. Prime-target linguistic cosine similarity

# The semantic space chosen
# Mandera et al. (2017; https://doi.org/10.1016/j.jml.2016.04.001) compared how different semantic spaces predicted
# responses in the lexical decision task of the Semantic Priming Project (Hutchison et al., 2013). Table 5 in
# Mandera et al. presents the explained variance achieved by each measure. The model using the most predictive
# measure reached R^2 = .471. We tried to implement this measure (retrieved from
# 'english-lemmas-count-window.3-subtitle_en.zip' at http://meshugga.ugent.be/snaut-downloads/spaces/english/count/),
# but it proved difficult because of the large size of the semantic space. Specifically, the RAM required in R
# surpassed 90 gigabytes, an amount of memory that was difficult to secure even on a high-performance computing
# cluster, where the job queued for several days. The same difficulty arose when using the 'snaut' software
# instead of R (http://meshugga.ugent.be/snaut//download/).

# Fortunately, as an alternative to the best semantic space, Table 5 in Mandera et al. (2017) shows that the
# second-best semantic space in predicting the aforementioned responses achieved R^2 = .465. This semantic space
# (retrieved from 'english-lemmas-cbow-window.6-dimensions.300-subtitle_en.w2v.gz' at
# http://meshugga.ugent.be/snaut-downloads/spaces/english/predict/) was far smaller, allowing a feasible
# computation even on a local machine. Therefore, the latter semantic space was downloaded below.

# Note on reproducibility
# The code below was used to download and save the data set in the folder 'semanticpriming/data/primary_datasets'.
# To prevent any influence from future changes to the original data set online, the code below was protected (i.e.,
# commented out) by inserting the snippet `# Protected code # ` at the beginning of each line. If desired, the code
# can be run by removing the protective snippet from every line.

# Protected code # download.file('http://meshugga.ugent.be/snaut-downloads/spaces/english/predict/english-lemmas-cbow-window.6-dimensions.300-subtitle_en.w2v.gz',
# Protected code #               'semanticpriming/data/primary_datasets/Mandera_etal_2017_english-lemmas-cbow-window.6-dimensions.300-subtitle_en.w2v.gz')

# Read in the semantic space once, then split it into a numeric matrix and its
# row names (reading the large file a single time avoids parsing it twice).
Mandera_etal_2017_raw = read_delim(
  'semanticpriming/data/primary_datasets/Mandera_etal_2017_english-lemmas-cbow-window.6-dimensions.300-subtitle_en.w2v.gz',
  col_names = FALSE, delim = ' ',
  # Skip first seven lines, as they have metadata
  skip = 7
)

# The first column contains the words; the remaining 300 columns are the
# dimensions of the semantic space, which must form a fully numeric matrix.
Mandera_etal_2017_semanticspace =
  data.matrix(Mandera_etal_2017_raw %>% select(-1))

# Set words (first column) as row names
rownames(Mandera_etal_2017_semanticspace) = pull(Mandera_etal_2017_raw, 1)

# Correct column names (X1 to X300)
colnames(Mandera_etal_2017_semanticspace) = paste0('X', 1:300)

# Free up workspace
rm(Mandera_etal_2017_raw)

# Obtain the unique combinations of prime and target words in the whole data set, discarding any repetitions
wordpairs = data.frame(
  unique(semanticpriming[, c('prime_word', 'target_word')]),
  cosine_similarity = NA
)

# Compute measure for each prime-target pair (prime word turned lowercase).
# Note: LSAfun::Cosine() requires both words to be present in the semantic space.
for (i in 1:nrow(wordpairs)) {
  wordpairs[i, 'cosine_similarity'] =
    LSAfun::Cosine(x = tolower(wordpairs[i, 'prime_word']),
                   y = wordpairs[i, 'target_word'],
                   tvectors = Mandera_etal_2017_semanticspace)
}

# Add cosine_similarity to main data set
semanticpriming = merge(semanticpriming, wordpairs,
                        by = c('prime_word', 'target_word'), all.x = TRUE)

# Free up workspace
# rm(Mandera_etal_2017_semanticspace)
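# For reference, the cosine similarity returned by LSAfun::Cosine() for two
# word vectors is their dot product divided by the product of their norms.
# A minimal base-R sketch of the same formula, for illustration only (the
# hypothetical arguments x and y stand in for two rows of the semantic space):
cosine_similarity_sketch = function(x, y) {
  sum(x * y) / (sqrt(sum(x^2)) * sqrt(sum(y^2)))
}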
# DATA SET 5. Lancaster Sensorimotor Norms (Lynott et al., 2020; https://doi.org/10.3758/s13428-019-01316-z),
# downloaded from https://osf.io/48wsc/download?version=1

# Note on reproducibility
# The code below was used to download and save the data set in the folder 'general_datasets'. Such a folder was
# used because this data set was used in more than one study. To prevent any influence from future changes to the
# original data set online, the code below was protected (i.e., commented out) by inserting the snippet
# `# Protected code # ` at the beginning of each line. If desired, the code can be run by removing the protective
# snippet from every line.

# Protected code # read.csv('https://osf.io/48wsc/download?version=1') %>%
# Protected code #   write.csv('general_datasets/Lynott_etal_2020_LancasterSensorimotorNorms.csv',
# Protected code #             row.names = FALSE)

# Read in data set
Lynott_etal_2020_LancasterSensorimotorNorms =
  read.csv('general_datasets/Lynott_etal_2020_LancasterSensorimotorNorms.csv')

# Summary
str(Lynott_etal_2020_LancasterSensorimotorNorms)

# To match the Lancaster data set, make prime words lowercase
semanticpriming$prime_word = tolower(semanticpriming$prime_word)
Lynott_etal_2020_LancasterSensorimotorNorms$Word =
  tolower(Lynott_etal_2020_LancasterSensorimotorNorms$Word)

# Number of words present both in the semanticpriming data set and
# in the Lancaster Sensorimotor Norms

# Primes
length(intersect(semanticpriming$prime_word,
                 Lynott_etal_2020_LancasterSensorimotorNorms$Word))

# Targets
length(intersect(semanticpriming$target_word,
                 Lynott_etal_2020_LancasterSensorimotorNorms$Word))

# Import Lancaster norms data,
# first for prime words
semanticpriming =
  merge(semanticpriming,
        Lynott_etal_2020_LancasterSensorimotorNorms %>%
          rename(prime_word = Word, prime_visual_rating = Visual.mean) %>%
          select(prime_word, prime_visual_rating),
        all.x = TRUE)

# now for target words
semanticpriming =
  merge(semanticpriming,
        Lynott_etal_2020_LancasterSensorimotorNorms %>%
          rename(target_word = Word, target_visual_rating = Visual.mean) %>%
          select(target_word, target_visual_rating),
        all.x = TRUE)

# Calculate prime-target difference
semanticpriming$visual_rating_diff =
  semanticpriming$prime_visual_rating - semanticpriming$target_visual_rating

# Free up workspace
# rm(Lynott_etal_2020_LancasterSensorimotorNorms)
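# An optional, read-only check (not part of the original pipeline): because the
# two merges above are left joins (all.x = TRUE), trials whose prime or target
# lacks a visual rating are kept with NA values, which na.omit() removes later.
# The proportion of such trials can be inspected directly:
mean(is.na(semanticpriming$visual_rating_diff))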
# DATA SET 6. Word concreteness (Brysbaert et al., 2014; https://doi.org/10.3758/s13428-013-0403-5),
# downloaded from:
# https://static-content.springer.com/esm/art%3A10.3758%2Fs13428-013-0403-5/MediaObjects/13428_2013_403_MOESM1_ESM.xlsx

# Note on reproducibility
# The code below was used to download and save the data set in the folder 'general_datasets'. Such a folder was
# used because this data set was used in more than one study. To prevent any influence from future changes to the
# original data set online, the code below was protected (i.e., commented out) by inserting the snippet
# `# Protected code # ` at the beginning of each line. If desired, the code can be run by removing the protective
# snippet from every line.

# Protected code # GET('https://static-content.springer.com/esm/art%3A10.3758%2Fs13428-013-0403-5/MediaObjects/13428_2013_403_MOESM1_ESM.xlsx',
# Protected code #     write_disk(tf <- tempfile(fileext = ".xlsx")))
# Protected code # read_excel(tf) %>%
# Protected code #   write.csv('general_datasets/Brysbaert_etal_2014_wordconcreteness.csv',
# Protected code #             row.names = FALSE)

# Read in data set
Brysbaert_etal_2014_wordconcreteness =
  read.csv('general_datasets/Brysbaert_etal_2014_wordconcreteness.csv')

# Import concreteness data,
# first for prime words (still lowercase, matching the Brysbaert et al. norms)
semanticpriming =
  merge(semanticpriming,
        Brysbaert_etal_2014_wordconcreteness %>%
          rename(prime_word = Word, prime_word_concreteness = Conc.M) %>%
          select(prime_word, prime_word_concreteness),
        all.x = TRUE)

# Revert the earlier change of prime words into lowercase
semanticpriming$prime_word = toupper(semanticpriming$prime_word)

# now for target words
semanticpriming =
  merge(semanticpriming,
        Brysbaert_etal_2014_wordconcreteness %>%
          rename(target_word = Word, target_word_concreteness = Conc.M) %>%
          select(target_word, target_word_concreteness),
        all.x = TRUE)

# Calculate prime-target difference
semanticpriming$word_concreteness_diff =
  semanticpriming$prime_word_concreteness - semanticpriming$target_word_concreteness

# Number of prime-target pairs that have a word-concreteness difference score
length(unique(semanticpriming[!is.na(semanticpriming$word_concreteness_diff),
                              'primeword_targetword']))

# Number of prime-target pairs lacking a word-concreteness difference score
length(unique(semanticpriming[is.na(semanticpriming$word_concreteness_diff),
                              'primeword_targetword']))

# Free up workspace
# rm(Brysbaert_etal_2014_wordconcreteness)

# Set variable classes (ensure numeric class after the merges)
semanticpriming$cosine_similarity = as.numeric(semanticpriming$cosine_similarity)
semanticpriming$target_word_frequency = as.numeric(semanticpriming$target_word_frequency)
semanticpriming$target_orthographic_Levenshtein_distance =
  as.numeric(semanticpriming$target_orthographic_Levenshtein_distance)
semanticpriming$target_phonological_Levenshtein_distance =
  as.numeric(semanticpriming$target_phonological_Levenshtein_distance)


#######################################################################################################

# Remove NAs, necessary because the power analysis that is to be conducted using the 'simr' package
# requires NA-free data (see https://github.com/pitakakariki/simr/issues/204).
semanticpriming = na.omit(semanticpriming)

str(semanticpriming)
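# An optional, read-only confirmation (not part of the original pipeline) that
# no NA values remain after na.omit(); every count should be zero.
colSums(is.na(semanticpriming))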
#######################################################################################################

# Trim RTs to 3 standard deviations within participants, within sessions and within
# interstimulus-interval conditions, as done in the Semantic Priming Project (Hutchison
# et al., 2013; https://doi.org/10.3758/s13428-012-0304-z).

# Create empty data frame using column names from the original data set
new_semanticpriming = semanticpriming[0,]

for (i in unique(semanticpriming$Participant)) {
  for (j in unique(semanticpriming$Session)) {
    for (k in unique(semanticpriming$interstimulus_interval)) {

      # First, select the current participant, session and interstimulus-interval condition
      current_subset = semanticpriming[semanticpriming$Participant == i &
                                         semanticpriming$Session == j &
                                         semanticpriming$interstimulus_interval == k,]

      # Keep RTs within the mean +/- 3 SD of the current subset
      result = current_subset[current_subset$target.RT >
                                mean(current_subset$target.RT) -
                                3 * sd(current_subset$target.RT) &
                                current_subset$target.RT <
                                mean(current_subset$target.RT) +
                                3 * sd(current_subset$target.RT),]

      new_semanticpriming = rbind(new_semanticpriming, result)
    }
  }
}

# View percentage of trials trimmed
((nrow(semanticpriming) - nrow(new_semanticpriming)) /
    nrow(semanticpriming)) * 100

# Apply change
semanticpriming = new_semanticpriming

# Z-score RT around each participant's own mean, following Faust et al. (1999;
# also see Pexman et al., 2017; Pexman & Yap, 2018; Yap et al., 2012, 2017).
semanticpriming$z_target.RT = scale_by(target.RT ~ Participant, semanticpriming)

# Z-score between-participants predictors, following Brauer and Curtin (2018;
# https://doi.org/10.1037/met0000159)
semanticpriming$z_vocabulary_size = scale(semanticpriming$vocabulary_size)
semanticpriming$z_attentional_control = scale(semanticpriming$attentional_control)
semanticpriming$z_recoded_participant_gender = scale(semanticpriming$recoded_participant_gender)

# Z-score between-items predictors around each participant's own mean,
# following Brauer and Curtin (2018; https://doi.org/10.1037/met0000159)
semanticpriming$z_target_word_frequency =
  scale_by(target_word_frequency ~ Participant, semanticpriming)
semanticpriming$z_target_length =
  scale_by(target_length ~ Participant, semanticpriming)
semanticpriming$z_target_number_syllables =
  scale_by(target_number_syllables ~ Participant, semanticpriming)
semanticpriming$z_target_orthographic_Levenshtein_distance =
  scale_by(target_orthographic_Levenshtein_distance ~ Participant, semanticpriming)
semanticpriming$z_target_phonological_Levenshtein_distance =
  scale_by(target_phonological_Levenshtein_distance ~ Participant, semanticpriming)
semanticpriming$z_cosine_similarity =
  scale_by(cosine_similarity ~ Participant, semanticpriming)
semanticpriming$z_visual_rating_diff =
  scale_by(visual_rating_diff ~ Participant, semanticpriming)
semanticpriming$z_word_concreteness_diff =
  scale_by(word_concreteness_diff ~ Participant, semanticpriming)
semanticpriming$z_recoded_interstimulus_interval =
  scale_by(recoded_interstimulus_interval ~ Participant, semanticpriming)

# Select and order columns to be kept
semanticpriming = semanticpriming %>%
  select(Participant, vocabulary_size, z_vocabulary_size, attentional_control,
         z_attentional_control, participant_gender, recoded_participant_gender,
         z_recoded_participant_gender, prime_word, target_word,
         primeword_targetword, Session, Block, Trial, interstimulus_interval,
         recoded_interstimulus_interval, z_recoded_interstimulus_interval,
         target_length, z_target_length, target_word_frequency,
         z_target_word_frequency, target_number_syllables,
         z_target_number_syllables, target_orthographic_Levenshtein_distance,
         z_target_orthographic_Levenshtein_distance,
         target_phonological_Levenshtein_distance,
         z_target_phonological_Levenshtein_distance, cosine_similarity,
         z_cosine_similarity, visual_rating_diff, z_visual_rating_diff,
         word_concreteness_diff, z_word_concreteness_diff, target.RT,
         z_target.RT)
# Save final data set
write.csv(semanticpriming,
          'semanticpriming/data/final_dataset/semanticpriming.csv',
          row.names = FALSE)

# Read back in
# semanticpriming = read.csv('semanticpriming/data/final_dataset/semanticpriming.csv')
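# For orientation only: a minimal, hypothetical sketch of how the prepared
# variables could enter a mixed-effects model with lme4. The actual model
# (and the simr power analysis mentioned above) is specified elsewhere in the
# project; the predictors and random-effects structure shown here are
# illustrative assumptions, not the study's model.
# library(lme4)
# fit = lmer(z_target.RT ~ z_cosine_similarity + z_visual_rating_diff +
#              z_recoded_interstimulus_interval +
#              (1 | Participant) + (1 | primeword_targetword),
#            data = semanticpriming)
# summary(fit)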