# Install readtext only when it is missing. An unconditional install.packages()
# re-downloads the package on every run and fails when there is no network
# access or no CRAN mirror configured.
if (!requireNamespace("readtext", quietly = TRUE)) {
  install.packages("readtext")
}
library(readtext)

# Download the speech transcript; readtext() returns a data frame whose `text`
# column holds the document body.
url <- "https://raw.githubusercontent.com/jcbonilla/BusinessAnalytics/master/BAData/JobsStandfordSpeech.txt"
speech <- readtext(url)
head(speech$text) # raw text, before cleaning

# Normalise the text. Order matters: apostrophes are dropped first so that
# straight-quoted contractions stay in one piece ("don't" -> "dont"), then
# everything that is not a letter becomes a space.
speech$text <- gsub("'", "", speech$text) # remove apostrophes
speech$text <- gsub("[[:punct:]]", " ", speech$text) # replace punctuation with space
speech$text <- gsub("[[:cntrl:]]", " ", speech$text) # replace control characters with space
speech$text <- gsub("[^a-zA-Z -]", " ", speech$text) # keep only ASCII letters, spaces and hyphens
# Trim last: the substitutions above can introduce new leading/trailing spaces
# (for example from a byte-order mark or a trailing digit), so trimming any
# earlier would leave them behind.
speech$text <- trimws(speech$text) # remove whitespace at both ends of documents
speech$text <- tolower(speech$text) # force to lowercase
head(speech$text) # cleaned text
# library() stops with an error when quanteda is not installed; require() only
# returns FALSE with a warning, which would let the script carry on and fail
# later at corpus() with a less helpful message.
library(quanteda)

# Wrap the cleaned text in a quanteda corpus.
speechcorpus <- corpus(speech$text)

# Explore the corpus
names(speechcorpus)
summary(speechcorpus) # summary of corpus
# Build the document-feature matrix for the speech.
#
# The `remove` and `stem` arguments of dfm() were deprecated in quanteda 3 and
# are no longer supported in quanteda 4, so the same steps are done explicitly
# on the tokens, in the order the legacy call applied them: tokenise, drop
# English stop words, stem what is left, then count features.
speech_tokens <- tokens(speechcorpus, verbose = TRUE)
speech_tokens <- tokens_remove(speech_tokens, pattern = stopwords("english"))
speech_tokens <- tokens_wordstem(speech_tokens)
dfm.speech <- dfm(speech_tokens, verbose = TRUE)

# 50 most frequent stemmed features
topfeatures(dfm.speech, n = 50)
# Custom stop list of frequent but uninformative features. Entries are written
# in their stemmed form (e.g. "someth", "decid", "everi") because they are
# matched against the already-stemmed features of dfm.speech.
# Named `custom_stopwords` rather than `list` to avoid shadowing base::list().
custom_stopwords <- c(
  "s", "t", "go", "now", "like", "ever", "just", "even", "someth", "next",
  "get", "got", "let", "ve", "later", "never", "month", "don", "didn", "know",
  "put", "make", "thing", "made", "everthing", "turn", "day", "first", "one",
  "today", "live", "best", "great", "decid", "start", "year", "can",
  "everyth", "everi", "way", "clear"
)

# dfm(<dfm>, remove = , stem = ) is deprecated/unsupported in current quanteda;
# dfm_remove() and dfm_wordstem() are the direct replacements.
dfm_stem <- dfm_remove(
  dfm.speech,
  pattern = c(custom_stopwords, stopwords("english")),
  verbose = TRUE
)
# NOTE(review): the features are already stemmed. This second pass mirrors the
# legacy `stem = TRUE` argument so results match the original script, but it
# may shorten some stems further. Drop this line if that is not wanted.
dfm_stem <- dfm_wordstem(dfm_stem)

# 50 most frequent features after removing the custom stop list
topfeatures(dfm_stem, n = 50)