# a. Using the tags column, create one TRUE/FALSE column per distinct tag.

# Load the TED talks data. read.csv() already returns a plain data.frame,
# so no tibble conversion (the deprecated as.tibble()) is needed.
ted <- read.csv("ted.csv", stringsAsFactors = FALSE)

# Raw tag strings, one element per talk: a comma-separated tag list that may
# be wrapped in brackets/quotes.
a <- as.character(ted[["tags"]])

# Clean the letters: keep only alphanumerics, commas and spaces, then split
# each talk's string on commas. The character class must be closed as
# [:alnum:] -- "[^[:alnum:|, ]" is a malformed pattern.
tag_list <- lapply(
  strsplit(gsub("[^[:alnum:], ]", "", a), ",", fixed = TRUE),
  function(tags) {
    tags <- trimws(tags)
    # Drop empty tokens and the NA produced by a missing tags value.
    tags[!is.na(tags) & nzchar(tags)]
  }
)

# Every cleaned tag occurrence across all talks (flattened).
b <- as.character(unlist(tag_list, use.names = FALSE))

# Create the tag names. Trimming happens before unique() so that "x" and
# " x" collapse to a single name instead of yielding duplicate columns.
tag_names <- sort(unique(b))

# Store the cleaned, comma-joined tag list back on the data set so the tags
# column agrees with the TAG_ column names.
ted$tags <- vapply(tag_list, paste, character(1), collapse = ",")

a_done <- ted
rm(ted)

# For each tag, record the row numbers of the talks that carry it.
# talk_index[k] is the row that tag occurrence b[k] came from.
talk_index <- rep(seq_along(tag_list), lengths(tag_list))
talks_by_tag <- split(talk_index, b)

# Add one logical column per tag. Membership is an exact match on the whole
# tag, not a regex/substring search, so "art" does not flag "heart".
for (tag in tag_names) {
  has_tag <- logical(nrow(a_done))
  has_tag[talks_by_tag[[tag]]] <- TRUE
  a_done[[paste0("TAG_", tag)]] <- has_tag
}

# b. Using the ratings column, create a new column for each rating category
#    (14 in total). The value will be the count for the associated category
#    for each row. For example, if the value is
#    [{'id': 7, 'name': 'Funny', 'count': 19645},
#     {'id': 1, 'name': 'Beautiful', 'count': 4573}]
#    then the RATINGS_Funny column will be 19645 and the RATINGS_Beautiful
#    column will be 4573.

# c. Using LASSO, fit a model using comments, duration, number of speakers
#    (num_speaker), the tag data (TAG_xxx), and the ratings data
#    (RATINGS_xxx).
# The TAG_ columns are TRUE/FALSE depending on whether the tag is present
# for the talk.

# Splitting up ratings ----

# The 14 rating categories; each becomes a RATINGS_<name> integer column.
rating_names <- c(
  "Inspiring", "Persuasive", "Courageous", "Fascinating", "Informative",
  "Ingenious", "Unconvincing", "Obnoxious", "Longwinded", "Jaw-dropping",
  "Confusing", "Funny", "Beautiful", "OK"
)

# Extract the count recorded for one rating category from ratings strings.
#
# ratings:     character vector; each element is expected to contain entries
#              shaped like {'id': 7, 'name': 'Funny', 'count': 19645}.
# rating_name: the category to look up. It is pasted into a regular
#              expression, so it must not contain regex metacharacters
#              (none of the names in rating_names do).
#
# Returns an integer vector the same length as `ratings`; NA where the
# category (or the ratings string itself) is missing.
extract_rating_count <- function(ratings, rating_name) {
  # Anchor on the quoted name followed by its 'count' key instead of a fixed
  # character offset after the first bare occurrence of the name.
  pattern <- paste0(
    "['\"]name['\"]:[[:space:]]*['\"]", rating_name, "['\"],",
    "[[:space:]]*['\"]count['\"]:[[:space:]]*([0-9]+)"
  )
  hits <- regmatches(ratings, regexec(pattern, ratings))
  vapply(
    hits,
    function(hit) {
      # A successful match yields c(full_match, captured_count).
      if (length(hit) == 2L) as.integer(hit[2]) else NA_integer_
    },
    integer(1),
    USE.NAMES = FALSE
  )
}

ratings_raw <- as.character(a_done[["ratings"]])

# One vectorised pass per category; columns are assigned in place rather
# than growing the data frame with cbind() for every row/category pair.
for (rat_name in rating_names) {
  a_done[[paste0("RATINGS_", rat_name)]] <-
    extract_rating_count(ratings_raw, rat_name)
}

b_done <- a_done

# Columns that are not used as LASSO predictors (free text, identifiers,
# dates, and the raw tags/ratings strings already expanded into columns).
nn <- c(
  "description", "event", "film_date", "languages", "main_speaker", "name",
  "published_date", "ratings", "related_talks", "speaker_occupation",
  "tags", "title", "url"
)

# drop = FALSE keeps a data frame even if only one column survives.
lasso_part <- b_done[, !(colnames(b_done) %in% nn), drop = FALSE]