Section 3 Token Analysis

This chapter contains the code for wrangling the cleaned and tokenized data into data frames of word frequencies, and for visualizing the data using the raw counts derived from this process.

# Global chunk options: echo the code, but hide messages and warnings.
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE
)
# When TRUE, run this on only one reference file.
debugging <- FALSE

# The input file or files are expected in a subdirectory with this name.
refsource <- "MungingProj2"
workingDir <- refsource
dataDir <- "Proj2Data"

# Title text for tables.
# NOTE(review): the original comment also mentions prefixes for all file
# reads and writes; no such prefix variable is visible in this chunk.
titletext <- "RedditCrypto"
srs <- c("CryptoCurrency", "CryptoMarkets")

# Infix complement of %in%: TRUE where an element of x is absent from table.
`%notin%` <- function(x, table) !(x %in% table)

3.1 Import

3.2 Token Counts

# Occurrences of every token in TknsC, most frequent first; count() stores
# the tally in column `n`. No grouping is applied in this step, so ungroup()
# only has an effect if TknsC itself arrives grouped.
allCount <- ungroup(count(TknsC, word, sort = TRUE))

3.2.1 Most Frequent Tokens

The following code shows the 25 most frequently occurring words across the full token set (the counts are not split by subreddit).

# The 25 most frequent tokens overall.
# The ranking column is passed to top_n() explicitly rather than relying on
# its "last column" default, which prints a "Selecting by n" message and
# would silently rank by a different column if one were ever appended to
# allCount. top_n() keeps ties, so more than 25 rows are possible.
sr_all_n25 <- allCount %>%
  top_n(25, n) %>%
  ungroup()

# Render the result as a table.
kable(sr_all_n25)

word	n
people	18605
crypto	17983
bitcoin	13204
money	12433
time	9221
btc	9095
market	8906
buy	7798
coins	6151
coin	5784
price	5536
lot	4204
lol	4189
sell	3983
exchange	3971
currency	3949
shit	3833
day	3802
blockchain	3537
eth	3412
pay	3364
fees	3296
world	3259
yeah	3240
real	3209

3.3 Wordclouds

The word cloud is constructed from the 100 most frequent words, with each word sized by its count relative to the most frequent word.

# sr_allCount <- allCount %>% filter(!subreddit == "r/all [control]")

# Keep the 100 most frequent tokens (ties included) and express each count as
# a proportion of the largest count; `prop` drives both size and colour below.
# The weight column is named explicitly so top_n() does not fall back to its
# "last column" default.
sr_allWC <- allCount %>%
  top_n(100, n) %>%
  mutate(prop = n / max(n))

# Seed the RNG before drawing so the word placement is reproducible.
set.seed(29)
ggplot(sr_allWC, aes(label = word, size = prop, color = prop)) +
  geom_text_wordcloud_area(shape = "circle", rm_outside = TRUE) +
  scale_size_area(max_size = 30) +
  theme_minimal()

# Make sure the output directory exists before saving; ggsave() cannot be
# relied on to create a missing target directory.
dir.create("CCViz", showWarnings = FALSE, recursive = TRUE)

# No plot is passed, so ggsave() writes the most recent ggplot (the one above).
ggsave(
  "CC_wordcloud.pdf",
  device = "pdf",
  path = "CCViz",
  height = 10,
  width = 16
)

3.3.1 Wordclouds over time

# Split the `created` timestamp at the space into a date part (kept in
# `created`) and a `time` part, parse the date, and derive numeric `year`,
# `month` and `day` columns from it.
# A plain mutate() replaces mutate_at(vars(created), funs(year, month, day)):
# funs() is deprecated, and the resulting column names (year, month, day) are
# the same ones the later steps rely on.
TknsByDate <- TknsC %>%
  separate(created, into = c("created", "time"), sep = " ") %>%
  mutate(
    created = ymd(created),
    year = year(created),
    month = month(created),
    day = day(created)
  )

# Keep tokens dated after 2015 and add `Month`, a Date built from the year
# and month columns (make_date() is called without a day argument).
TknsByMonth <- mutate(
  filter(TknsByDate, year > 2015),
  Month = make_date(year, month)
)

# Count each token within every subreddit / month / year combination; the
# tally lands in column `n`. count() keeps the grouping of its input.
monthCount <- count(
  group_by(TknsByMonth, subreddit, month, year),
  word,
  sort = TRUE
)

# Within each of those groups keep the ten most frequent tokens (ties
# included), then drop the grouping.
monthCount <- ungroup(top_n(monthCount, 10, n))

# Scale each count by the largest count in its subreddit / month / year group,
# so the most frequent token of every group gets prop == 1.
# ungroup() is added so the grouping does not leak into later steps; the
# values and rows are unchanged.
monthCount <- monthCount %>%
  group_by(subreddit, month, year) %>%
  mutate(prop = n / max(n)) %>%
  ungroup()

# Number of retained rows per subreddit.
table(monthCount$subreddit)

## 
## r/CryptoCurrency  r/CryptoMarkets 
##              396              435

# Grid of small word clouds: one row per month, one column per year, with
# words sized and coloured by `prop`.
# NOTE(review): `prop` is scaled per subreddit / month / year, but the panels
# are split by month and year only, so tokens from both subreddits are drawn
# in the same panel. Confirm this is intended.
ggplot(monthCount, aes(label = word, size = prop, color = prop)) +
  geom_text_wordcloud_area(rm_outside = TRUE) +
  scale_size_area(max_size = 5) +
  theme_minimal() +
  facet_grid(rows = vars(month), cols = vars(year))

3.4 Currencies

# Name and ticker spellings used to recognise each coin in the `word` column.
BTC <- c(
  "Bitcoin", "bitcoin",
  "BTC", "btc", "Btc"
)
ETH <- c(
  "Ethereum", "ethereum",
  "ETH", "eth", "Eth"
)
XRP <- c(
  "Ripple", "ripple",
  "XRP", "xrp", "Xrp"
)
LTC <- c(
  "Litecoin", "litecoin",
  "LTC", "ltc", "Ltc"
)

# Every recognised spelling, in BTC, ETH, XRP, LTC order.
currencies <- c(BTC, ETH, XRP, LTC)

# For frequency analysis: the dated tokens that are one of the coin spellings.
CurTkns <- filter(TknsByDate, word %in% currencies)

# For sentiment analysis: the rows of CommData whose comm_id appears among
# the coin tokens selected above.
CurComms <- filter(CommData, comm_id %in% CurTkns$comm_id)

3.4.1 Assigning Identifiers

# Label every token with the ticker of the coin it refers to.
# The column is referenced as `word` rather than `.$word`: the `.$` form
# bypasses mutate()'s data mask, so it would use the whole column instead of
# the current group on grouped data and does not work with the native pipe.
# case_when() has no fallback branch, so a token matching none of the four
# spelling sets would get NA.
CurTkns <- CurTkns %>%
  mutate(Coin = case_when(
    word %in% BTC ~ "BTC",
    word %in% ETH ~ "ETH",
    word %in% XRP ~ "XRP",
    word %in% LTC ~ "LTC"
  ))

# Add `Month`, a Date built from the year and month columns, to group by.
curTknsByMonth <- mutate(CurTkns, Month = make_date(year, month))

# Occurrences of each spelling within each coin (column `n`), largest first.
curCounts <- ungroup(
  count(group_by(curTknsByMonth, Coin), word, sort = TRUE)
)

# Occurrences of each spelling within each coin and month (column `n`),
# largest first.
curCountsbyMonth <- ungroup(
  count(group_by(curTknsByMonth, Coin, Month), word, sort = TRUE)
)

# Mentions per coin. Each coin has one row per spelling, and the bars for
# those rows are drawn at the same x position.
ggplot(curCounts, aes(x = Coin, y = n, fill = Coin)) +
  geom_bar(stat = "identity")

# The same chart, with one panel per month.
ggplot(curCountsbyMonth, aes(x = Coin, y = n, fill = Coin)) +
  geom_bar(stat = "identity") +
  facet_wrap(~Month)

# Save the labelled coin tokens (with their Month column) under dataDir.
write.csv(
  x = curTknsByMonth,
  file = paste0(dataDir, "/CoinTknsMonthly.csv"),
  row.names = FALSE
)