# Read the CSV straight from its GitHub raw URL into a data frame.
URL <- "https://raw.githubusercontent.com/DS4PS/cpp-527-fall-2020/master/labs/data/medium-data-utf8-v2.csv"
d <- read.csv(file = URL)
# Fit a quadratic regression of log(y) on x and report it.
#
# Arguments:
#   x  numeric predictor
#   y  numeric response; it is log-transformed before fitting
#
# Prints the model summary as a side effect and returns the fitted
# lm object.
run_regression <- function( x, y )
{
  x2 <- x*x
  y <- log(y)
  m <- lm( y ~ x + x2 )
  # A bare summary(m) inside a function body is evaluated and then
  # discarded, so it has to be printed explicitly to be seen.
  print( summary(m) )
  return(m)
}
# Simulated data: a linear trend in x plus normal noise with sd 25.
x <- seq_len(100)
y <- 10 + 2 * x + rnorm(n = 100, mean = 0, sd = 25)

# Straight-line fit on the raw response.
m <- lm(formula = y ~ x)
summary(object = m)

# Quadratic fit on the log response, done inside the helper.
m2 <- run_regression(x = x, y = y)
summary(object = m2)
# Regress log(y) on x and x squared.
# Prints the summary of the fit, then hands the lm object back.
run_regression <- function(x, y) {
  # `x2` and `y` keep these names because they become the term
  # labels of the fitted formula.
  x2 <- x * x
  y <- log(y)
  fit <- lm(y ~ x + x2)
  print(summary(fit))
  fit
}
# Draw a fresh simulated sample and refit the straight-line model.
x <- seq_len(100)
y <- 10 + 2 * x + rnorm(n = 100, mean = 0, sd = 25)
m <- lm(formula = y ~ x)
summary(object = m)
# Quadratic model of the log response.
#
#   x  predictor
#   y  response, log-transformed before the fit
#
# Side effect: prints the model summary. Value: the fitted lm object.
run_regression <- function(x, y) {
  x2 <- x * x
  y <- log(y)
  model <- lm(y ~ x + x2)
  model_summary <- summary(model)
  print(model_summary)
  model
}
# Fit the quadratic log model through the helper.
m2 <- run_regression(x = x, y = y)

# Compare the two model objects, then look at the raw response.
summary(object = m)
summary(object = m2)
summary(object = y)

# A function name typed without parentheses shows its definition
# at the console.
lm
# Fit a quadratic regression of log(y) on x, print its summary and
# return the fitted lm object.
#
# Arguments:
#   x  numeric predictor
#   y  numeric response. Optional: when it is not supplied the function
#      falls back to the `y` visible from the environment in which the
#      function was defined, which is what the one-argument version did
#      implicitly through a free variable.
run_regression <- function( x, y )
{
  if( missing(y) )
  {
    # Keep the old one-argument call working, but make the outside
    # lookup explicit instead of hiding it in the body.
    y <- get( "y", envir = parent.env( environment() ) )
  }
  x2 <- x*x
  y <- log(y)
  m <- lm( y ~ x + x2 )
  print( summary(m) )
  return( m )
}
# Call the helper with the predictor only.
m2 <- run_regression(x = x)

# List the objects in the workspace and re-check the linear fit.
ls()
summary(object = m)

# New simulated sample and a fresh straight-line fit.
x <- seq_len(100)
y <- 10 + 2 * x + rnorm(n = 100, mean = 0, sd = 25)
m <- lm(formula = y ~ x)
summary(object = m)
# Fit a quadratic regression of log(y) on x, print its summary and
# return the fitted lm object.
#
# Arguments:
#   x  numeric predictor
#   y  numeric response; it is log-transformed before fitting
run_regression <- function( x, y )
{
  x2 <- x*x
  y <- log(y)
  # Plain local assignment: with `<<-` the fit was written into a
  # variable `m` outside the function, silently replacing any model
  # the caller had stored under that name.
  m <- lm( y ~ x + x2 )
  print( summary(m) )
  return( m )
}
# Run the helper, then inspect both model objects after the call.
m2 <- run_regression(x, y)
summary(object = m2)
summary(object = m)
# Reload the CSV from its GitHub raw URL and clean up the `title` column.
URL <- "https://raw.githubusercontent.com/DS4PS/cpp-527-fall-2020/master/labs/data/medium-data-utf8-v2.csv"
d <- read.csv( URL )
# replace weird spaces with regular spaces
# NOTE(review): the first pattern below renders as an ordinary space in this
# copy; presumably it was a non-breaking or other special space - confirm.
d$title <- gsub( " ", " ", d$title )
# replace every whitespace character matched by \s with a regular space
d$title <- gsub( "\\s", " ", d$title )
# note the use of single-quote marks since double-quotes appear in the text
# NOTE(review): both patterns below are empty strings in this copy, so the
# original search text appears to have been lost and these two calls most
# likely leave the titles unchanged - TODO restore the intended patterns.
d$title <- gsub( '', "", d$title )
d$title <- gsub( '', "", d$title )
# must use double-escape in front of the plus sign
# since it is an operator in reg-ex
# NOTE(review): no plus sign appears in the pattern below, so part of the
# original pattern may be missing - confirm. As written it removes every
# em dash character.
# —
d$title <- gsub( "—", "", d$title )
titles <- tolower( d$title ) # convert to lower case
titles <- gsub( "[0-9]", "", titles ) # remove numbers
head( titles ) # look at the first few cleaned titles
# Split every title into a character vector of words; the result is a
# list with one element per title.
# A title that starts with a space or contains a run of spaces (for
# example once the digits have been stripped from "10 tips ...") would
# produce empty "" tokens when split on a single " ". Trimming the ends
# and splitting on one-or-more spaces keeps only real words.
words <- strsplit( trimws( titles ), " +" )
head( words )
class( words )
head( words )
library( dplyr )
# Flatten the first few titles into one character vector.
unlist(head(words))
head(words)

# Single brackets keep the list wrapper; double brackets pull out the
# character vector stored at that position.
words[1]
words[[1]][1]
words[[1]][2]

# Work with the words of the first title.
one.sentence <- words[[1]]
one.sentence
class(words)
class(one.sentence)
one.sentence
one.sentence[1]
length(one.sentence)
one.sentence[10]
head(words)

# Last word of a title: index with the vector's own length.
tail(one.sentence, n = 1)
one.sentence <- words[[3]]
tail(one.sentence, n = 1)
one.sentence
head(words)

# There should be exactly one word vector per title.
length(titles)
head(titles)
head(words)
identical(length(titles), length(words))
length(titles)
length(words)
# Count the number of words in every title.
# The result vector is allocated once up front and the index comes from
# seq_along(), so an empty `words` list gives an empty result instead of
# the loop running over 1:0 and failing on words[[1]].
results <- integer( length(words) )
for( i in seq_along(words) )
{
  # extract vector from list position i
  one.sentence <- words[[i]]
  # analysis with one sentence at a time
  num.words <- length( one.sentence )
  # save results
  results[i] <- num.words
}
head( results )
head( words )
# Collect the first word of every title.
# The result vector is allocated once up front and the index comes from
# seq_along(), so an empty `words` list gives an empty result instead of
# the loop running over 1:0 and failing on words[[1]].
results <- character( length(words) )
for( i in seq_along(words) )
{
  # extract vector from list position i
  one.sentence <- words[[i]]
  # analysis with one sentence at a time
  first.word <- one.sentence[1]
  # save results
  results[i] <- first.word
}
head( results )
# frequency of each first word, least common first
table( results ) %>% sort()
head( words )
# Word count per title, stored at the same position as the title.
# Preallocating and iterating with seq_along() avoids growing the vector
# one element at a time and keeps the loop safe when `words` is empty.
results <- integer( length(words) )
for( i in seq_along(words) )
{
  # extract vector from list position i
  one.sentence <- words[[i]]
  # analysis with one sentence at a time
  num.words <- length( one.sentence )
  # save results
  results[i] <- num.words
}
length( results )
head( words )
# lapply() applies length() to every element and returns a list with one
# entry per title.
results <- lapply( words, length )
head( results )
length( results )
# The misspelled call lenth( titles ) that used to sit here was dropped:
# no such function exists, so it stopped the script with an error.
length( titles )
# unlist() flattens the list of counts into a plain vector.
results <- unlist( results )
head( results )
hist( results )
# vapply() does both steps at once and always returns an integer vector;
# sapply() would return an empty list instead when `words` is empty.
results <- vapply( words, length, integer(1) )
head( results )
class( results )
words[[1]]
head( words )
# Return the element in the first position of `x`.
# For an empty atomic vector the result is NA.
get_first_name <- function(x) {
  x[1L]
}
# Try the helper on a couple of individual titles first.
one.sentence <- words[[1]]
get_first_name( one.sentence )
one.sentence <- words[[2]]
get_first_name( one.sentence )
head( words )
# Apply it to every title. vapply() states the expected result (one
# character value per title) and therefore always returns a character
# vector, whereas sapply() picks its return type from the data.
results <- vapply( words, get_first_name, character(1) )
head( results )
# Return the element in the last position of `x`, found by indexing
# with the length of `x`. An empty vector gives back an empty vector.
get_last_name <- function(x) {
  n <- length(x)
  x[n]
}
# First word of a sentence.
#   x  vector of words
# Returns the element at position one; NA if `x` is an empty atomic vector.
get_first_word <- function(x) x[1L]
# Last word of a sentence.
#   x  vector of words
# Returns the element whose index equals the length of `x`; a
# zero-length `x` yields a zero-length result.
get_last_word <- function(x) {
  last_position <- length(x)
  x[last_position]
}
# Last word of every title, then of the single title held in one.sentence.
results <- sapply(X = words, FUN = get_last_word)
get_last_word(x = one.sentence)

# A negative index drops that position: this shows everything except
# the first word, not the last word.
one.sentence[-1L]
one.sentence