# Text pre-processing walkthrough with quanteda:
# load mission statements, build a corpus, tokenize, compound
# multi-word phrases with a dictionary, inspect n-grams, and stem.

library(dplyr)
library(pander)
library(quanteda)
library(quanteda.textstats) # provides textstat_frequency()
# library(quanteda.textmodels)
# library(quanteda.textplots)

URL <- "https://github.com/DS4PS/cpp-527-spr-2020/blob/master/labs/data/IRS-1023-EZ-MISSIONS.rds?raw=true"

# read the .rds file through a gzcon-wrapped url connection
dat <- readRDS(gzcon(url(URL)))

head(dat[c("orgname", "codedef01", "mission")]) %>%
  pander()

# convert missions to all lower-case
dat$mission <- dat$mission %>%
  tolower()

# use a sample for demo purposes
# (the sample is random and no seed is set here, so the
#  frequencies recorded further down will differ between runs)
dat.sample <- dat %>%
  sample_n(1000)

corp <- dat.sample %>%
  corpus(text_field = "mission")

corp[1:10, ] %>%
  summary() %>%
  knitr::kable(align = "c")


##########
##########  PRE-PROCESSING STEPS
##########

# drop sentences that contain fewer than 3 tokens
# (what = "sentences" trims at the sentence level;
#  min_ntoken is a token count, not a sentence count)
corp <- corp %>%
  corpus_trim(what = "sentences", min_ntoken = 3)

# remove punctuation
tokens <- corp %>%
  tokens(what = "word", remove_punct = TRUE)

tokens %>%
  head()

# remove filler words like
# the, and, a, to
tokens <- tokens %>%
  tokens_remove(c(stopwords("english"), "nbsp"), padding = FALSE)


##########
##########  DICTIONARIES
##########

# multi-word phrases that should be treated as a single token
my_dictionary <- dictionary(list(
  five01_c_3 = c("501 c 3", "section 501 c 3"),
  united_states = c("united states"),
  high_school = c("high school"),
  non_profit = c("non-profit", "non profit", "nonprofit"),
  stem = c(
    "science technology engineering math",
    "science technology engineering mathematics"
  ),
  los_angeles = c("los angeles"),
  ny_state = c("new york state"),
  ny = c("new york")
))

# apply the dictionary to the text
tokens <- tokens %>%
  tokens_compound(pattern = my_dictionary)

tokens %>%
  head()


##########
##########  N-GRAMS
##########

# find frequently co-occurring words
# (typically compound words)

# 2-GRAMS
ngram2 <- tokens %>%
  tokens_ngrams(n = 2) %>%
  dfm()

# requires quanteda.textstats library
ngram2 %>%
  textstat_frequency(n = 10)

# 3-GRAMS
ngram3 <- tokens %>%
  tokens_ngrams(n = 3) %>%
  dfm()

ngram3 %>%
  textstat_frequency(n = 10)

tokens %>%
  dfm() %>%
  topfeatures()

# The tables below are output recorded from an earlier run.
# They are kept as comments so the script stays executable.

##### MEANINGFUL 2-GRAMS: ADD TO DICTIONARY
#
#                   feature frequency rank docfreq group
#  8           jesus_christ        15    7      15   all
# 23         youth_football         9   23       6   all
# 26        school_district         8   26       8   all
# 28             low_income         8   26       8   all
# 29       higher_education         8   26       8   all
# 30          special_needs         8   26       7   all
# 31  community_development         8   26       8   all
# 39            foster_care         7   36       6   all
# 42          mental_health         7   36       6   all

##### NOISY 2-GRAMS: IGNORE
#
#                   feature frequency rank docfreq group
#  1        mission_provide        26    1      26   all
#  3  organized_exclusively        19    3      19   all  # organized exclusively for charitable purposes: 4-gram
#  4 exclusively_charitable        19    3      19   all  # organized exclusively for charitable purposes: 4-gram
#  5           quality_life        18    5      18   all
# 32        mission_educate         8   26       8   all
# 40     organization_shall         7   36       7   all
#  9  corporation_organized        14    9      14   all


##########
##########  STEMMING
##########

# old version:
# deprecated stem=T argument
# tokens %>%
#   dfm( stem=T ) %>%
#   topfeatures()

# top features without stemming, for comparison
tokens %>%
  dfm() %>%
  topfeatures()

# new version: dfm_wordstem() replaces the stem=T argument
tokens %>%
  dfm() %>%
  dfm_wordstem() %>%
  topfeatures()