diff --git a/Cluster Analysis b/Cluster Analysis.R
similarity index 100%
rename from Cluster Analysis
rename to Cluster Analysis.R
diff --git a/Crawl twitter b/Crawl twitter
deleted file mode 100644
index 83ca007..0000000
--- a/Crawl twitter
+++ /dev/null
@@ -1,16 +0,0 @@
-requestURL <- "https://api.twitter.com/oauth/request_token"
-accessURL = "https://api.twitter.com/oauth/access_token"
-authURL = "https://api.twitter.com/oauth/authorize"
-consumerKey = "XXXXXXXXXXXX"
-consumerSecret = "XXXXXXXXXXXXXXXX"
-twitCred <- OAuthFactory$new(consumerKey=consumerKey,
-                             consumerSecret=consumerSecret,
-                             requestURL=requestURL,
-                             accessURL=accessURL,
-                             authURL=authURL)
-download.file(url="http://curl.haxx.se/ca/cacert.pem",
-              destfile="cacert.pem")
-twitCred$handshake(cainfo="cacert.pem")
-save(list="twitCred", file="twitteR_credentials")
-load("twitteR_credentials")
-registerTwitterOAuth(twitCred) #Register your app with Twitter.
diff --git a/Movie Review Engine - R b/Movie Review Engine.R
similarity index 100%
rename from Movie Review Engine - R
rename to Movie Review Engine.R
diff --git a/R Text classfication using CSV files b/R Text classfication using CSV files.R
similarity index 100%
rename from R Text classfication using CSV files
rename to R Text classfication using CSV files.R
diff --git a/README.md b/README.md
index 618962b..178e913 100644
--- a/README.md
+++ b/README.md
@@ -2,3 +2,44 @@
 Text-Classification---R
 =======================
 Classifying documents into categories
+
+[Document Classification using R](http://www.dataperspective.info/2013/07/document-classification-using-r.html)
+
+The file "Topic Modelling.R" is based on the blog post [Topic Modeling in R](http://www.bigdatanews.datasciencecentral.com/profiles/blogs/topic-modeling-in-r).
+
+Before running on Windows I had some setup to do:
+
+* Install R from https://cran.r-project.org/bin/windows/base/
+* Install RStudio from https://www.rstudio.com/products/rstudio/download/
+* Create a Twitter App to collect the data at https://apps.twitter.com/app/new - more details in [Twitter Analytics Using R Part 1: Extract Tweets](https://www.credera.com/blog/business-intelligence/twitter-analytics-using-r-part-1-extract-tweets/)
+
+The values from the Twitter app need to go into these lines in "Topic Modelling.R":
+
+```R
+Consumer_key <- "YOUR_CONSUMER_KEY"
+Consumer_secret <- "YOUR_CONSUMER_SECRET"
+access_token <- "YOUR_ACCESS_TOKEN"
+access_token_secret <- "YOUR_TOKEN_SECRET"
+```
+
+Then in the console make sure all the required packages are installed (this is in the script "prereqs.R"):
+
+```R
+install.packages('twitteR')
+install.packages("tm")
+install.packages("wordcloud")
+install.packages("slam")
+install.packages("topicmodels")
+install.packages('base64enc')
+install.packages('readr')
+```
+
+Now you are ready to run "Topic Modelling.R" in RStudio! There are two lines near the head of the file that can be used to tweak the number and type of results:
+
+```R
+numTweets <- 900
+
+tweetData <- searchTwitter("flight", n=numTweets, lang="en")
+```
+
+numTweets sets how many tweets the Twitter search pulls down for the analysis.
+
+tweetData holds the data that we want to analyse. Here I have done a simple search for the text "flight" in English. See [Getting Data via twitteR](https://sites.google.com/site/miningtwitter/basics/getting-data/by-twitter) for more ideas.
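+
+For example, this sketch narrows the same search using a couple of searchTwitter's optional arguments (geocode takes "latitude,longitude,radius"; resultType can be "recent", "popular" or "mixed") - the values shown are only illustrative, adjust or drop them to suit:
+
+```R
+# fewer tweets, only recent ones, posted near New York (example values)
+numTweets <- 200
+tweetData <- searchTwitter("flight", n=numTweets, lang="en",
+                           geocode="40.75,-73.99,50mi", resultType="recent")
+```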
+Look in "topic-exploration.R" and "Sentiment.R" for more ways to explore the data downloaded by "Topic Modelling.R".
diff --git a/Sentiment.R b/Sentiment.R
new file mode 100644
index 0000000..9e60d49
--- /dev/null
+++ b/Sentiment.R
@@ -0,0 +1,81 @@
+# load required libraries
+library(plyr)
+library(stringr)
+library(readr)
+# download https://github.com/uwescience/datasci_course_materials/blob/master/assignment1/AFINN-111.txt
+# and put it in your working directory; it contains the word list used for the sentiment scoring below
+
+# This is a follow-on from "Topic Modelling.R" and takes a different look at the data
+
+#get the tweets
+dataset <- read_csv("tweets.csv")
+tweets_txt <- dataset$text
+
+
+#function to clean data
+cleanTweets = function(tweets)
+{
+  tweets_cl = gsub("(RT|via)((?:\\b\\W*@\\w+)+)","",tweets)
+  tweets_cl = gsub("http[^[:blank:]]+", "", tweets_cl)
+  tweets_cl = gsub("@\\w+", "", tweets_cl)
+  tweets_cl = gsub("[ \t]{2,}", "", tweets_cl)
+  tweets_cl = gsub("^\\s+|\\s+$", "", tweets_cl)
+  tweets_cl = gsub("[[:punct:]]", " ", tweets_cl)
+  tweets_cl = gsub("[^[:alnum:]]", " ", tweets_cl)
+  tweets_cl <- gsub('\\d+', '', tweets_cl)
+  return(tweets_cl)
+}
+
+#function to calculate the number of words in each sentiment category within a sentence
+sentimentScore <- function(sentences, vNegTerms, negTerms, posTerms, vPosTerms){
+  final_scores <- matrix('', 0, 5)
+  scores <- laply(sentences, function(sentence, vNegTerms, negTerms, posTerms, vPosTerms){
+    initial_sentence <- sentence
+    #remove unnecessary characters and split up by word
+    sentence = cleanTweets(sentence)
+    sentence <- tolower(sentence)
+    wordList <- str_split(sentence, '\\s+')
+    words <- unlist(wordList)
+    #build vector with matches between sentence and each category
+    vPosMatches <- match(words, vPosTerms)
+    posMatches <- match(words, posTerms)
+    vNegMatches <- match(words, vNegTerms)
+    negMatches <- match(words, negTerms)
+    #sum up number of words in each category
+    vPosMatches <- sum(!is.na(vPosMatches))
+    posMatches <- sum(!is.na(posMatches))
+    vNegMatches <- sum(!is.na(vNegMatches))
+    negMatches <- sum(!is.na(negMatches))
+    score <- c(vNegMatches, negMatches, posMatches, vPosMatches)
+    #add row to scores table: the tweet text followed by the four category counts
+    newrow <- c(initial_sentence, score)
+    final_scores <- rbind(final_scores, newrow)
+    return(final_scores)
+  }, vNegTerms, negTerms, posTerms, vPosTerms)
+  return(scores)
+}
+
+
+#load the AFINN words and their scores
+afinn_list <- read.delim(file='AFINN-111.txt', header=FALSE, stringsAsFactors=FALSE)
+names(afinn_list) <- c('word', 'score')
+afinn_list$word <- tolower(afinn_list$word)
+
+#categorize words as very negative to very positive
+vNegTerms <- afinn_list$word[afinn_list$score==-5 | afinn_list$score==-4]
+negTerms <- afinn_list$word[afinn_list$score==-3 | afinn_list$score==-2 | afinn_list$score==-1]
+posTerms <- afinn_list$word[afinn_list$score==3 | afinn_list$score==2 | afinn_list$score==1]
+vPosTerms <- afinn_list$word[afinn_list$score==5 | afinn_list$score==4]
+
+#Calculate score on each tweet
+tweetResult <- as.data.frame(sentimentScore(tweets_txt, vNegTerms, negTerms, posTerms, vPosTerms))
+tweetResult$'2' = as.numeric(tweetResult$'2')
+tweetResult$'3' = as.numeric(tweetResult$'3')
+tweetResult$'4' = as.numeric(tweetResult$'4')
+tweetResult$'5' = as.numeric(tweetResult$'5')
+counts = c(sum(tweetResult$'2'),sum(tweetResult$'3'),sum(tweetResult$'4'),sum(tweetResult$'5'))
+names = c("Very Negative","Negative","Positive","Very Positive")
+mr = list(counts,names)
+colors = c("red", "yellow", "green", "violet")
+barplot(mr[[1]], main="Simple Twitter Sentiment Analysis", xlab="Number of matched words", legend=mr[[2]], col=colors)
+
diff --git a/Topic Modelling b/Topic Modelling
deleted file mode 100644
index 67448b5..0000000
--- a/Topic Modelling
+++ /dev/null
@@ -1,47 +0,0 @@
-library("tm")
-library("wordcloud")
-library("slam")
-library("topicmodels")
-
-#Load Text
-con <- file("tweets.txt", "rt")
-tweets = readLines(con)
-
-#Clean Text
-tweets = gsub("(RT|via)((?:\\b\\W*@\\w+)+)","",tweets)
-tweets = gsub("http[^[:blank:]]+", "", tweets)
-tweets = gsub("@\\w+", "", tweets)
-tweets = gsub("[ \t]{2,}", "", tweets)
-tweets = gsub("^\\s+|\\s+$", "", tweets)
-tweets <- gsub('\\d+', '', tweets)
-tweets = gsub("[[:punct:]]", " ", tweets)
-
-corpus = Corpus(VectorSource(tweets))
-corpus = tm_map(corpus,removePunctuation)
-corpus = tm_map(corpus,stripWhitespace)
-corpus = tm_map(corpus,tolower)
-corpus = tm_map(corpus,removeWords,stopwords("english"))
-tdm = DocumentTermMatrix(corpus) # Creating a Term document Matrix
-
-# create tf-idf matrix
-term_tfidf <- tapply(tdm$v/row_sums(tdm)[tdm$i], tdm$j, mean) * log2(nDocs(tdm)/col_sums(tdm > 0))
-summary(term_tfidf)
-tdm <- tdm[,term_tfidf >= 0.1]
-tdm <- tdm[row_sums(tdm) > 0,]
-summary(col_sums(tdm))
-
-#Deciding best K value using Log-likelihood method
-best.model <- lapply(seq(2, 50, by = 1), function(d){LDA(tdm, d)})
-best.model.logLik <- as.data.frame(as.matrix(lapply(best.model, logLik)))
-
-#calculating LDA
-k = 50;#number of topics
-SEED = 786; # number of tweets used
-CSC_TM <-list(VEM = LDA(tdm, k = k, control = list(seed = SEED)),VEM_fixed = LDA(tdm, k = k,control = list(estimate.alpha = FALSE, seed = SEED)),Gibbs = LDA(tdm, k = k, method = "Gibbs",control = list(seed = SEED, burnin = 1000,thin = 100, iter = 1000)),CTM = CTM(tdm, k = k,control = list(seed = SEED,var = list(tol = 10^-4), em = list(tol = 10^-3))))
-
-#To compare the fitted models we first investigate the values of the models fitted with VEM and estimated and with VEM and fixed
-sapply(CSC_TM[1:2], slot, "alpha")
-sapply(CSC_TM, function(x) mean(apply(posterior(x)$topics, 1, function(z) - sum(z * log(z)))))
-Topic <- topics(CSC_TM[["VEM"]], 1)
-Terms <- terms(CSC_TM[["VEM"]], 8)
-Terms
diff --git a/Topic Modelling.R b/Topic Modelling.R
new file mode 100644
index 0000000..a0eccbd
--- /dev/null
+++ b/Topic Modelling.R
@@ -0,0 +1,58 @@
+library("twitteR")
+library("tm")
+library("wordcloud")
+library("slam")
+library("topicmodels")
+library("readr")
+
+#Connect to Twitter
+# For guidance on creating an app see
+# https://www.credera.com/blog/business-intelligence/twitter-analytics-using-r-part-1-extract-tweets/
+Consumer_key <- "YOUR_CONSUMER_KEY"
+Consumer_secret <- "YOUR_CONSUMER_SECRET"
+access_token <- "YOUR_ACCESS_TOKEN"
+access_token_secret <- "YOUR_TOKEN_SECRET"
+setup_twitter_oauth(Consumer_key, Consumer_secret, access_token, access_token_secret)
+
+numTweets <- 900
+
+#get data set and save for later, e.g. to investigate anomalies
+# note: the Twitter search API may return fewer than numTweets results
+tweetData <- searchTwitter("flight", n=numTweets, lang="en")
+write.csv(twListToDF(tweetData), file="tweets.csv")
+
+#Load Text
+dataset <- read_csv("tweets.csv")
+tweets <- dataset$text
+#
+
+#Clean Text
+tweets = gsub("(RT|via)((?:\\b\\W*@\\w+)+)","",tweets)
+tweets = gsub("http[^[:blank:]]+", "", tweets)
+tweets = gsub("@\\w+", "", tweets)
+tweets = gsub("[ \t]{2,}", "", tweets)
+tweets = gsub("^\\s+|\\s+$", "", tweets)
+tweets <- gsub('\\d+', '', tweets)
+tweets = gsub("[[:punct:]]", " ", tweets)
+
+corpus = Corpus(VectorSource(tweets))
+corpus = tm_map(corpus,removePunctuation)
+corpus = tm_map(corpus,stripWhitespace)
+corpus = tm_map(corpus,content_transformer(tolower))
+corpus = tm_map(corpus,removeWords,stopwords("english"))
+#remove the Twitter related metadata
+corpus = tm_map(corpus,removeWords, c("iphone","android","web", "tweetdeck", "ifttt"))
+tdm = DocumentTermMatrix(corpus) # Create a Document-Term Matrix
+
+# create tf-idf matrix
+# see http://tidytextmining.com/tfidf.html for the theory
+term_tfidf <- tapply(tdm$v/row_sums(tdm)[tdm$i], tdm$j, mean) * log2(nDocs(tdm)/col_sums(tdm > 0))
+summary(term_tfidf)
+tdm <- tdm[,term_tfidf >= 0.1]
+tdm <- tdm[row_sums(tdm) > 0,]
+
+#Perform the topic modelling and save a summary of the topics
+#Further info on LDA here http://blog.echen.me/2011/08/22/introduction-to-latent-dirichlet-allocation/
+lda <- LDA(tdm, k = 8) # find 8 topics
+Terms <- terms(lda, 10) # first 10 terms of every topic
+Terms
+write.csv(Terms, file = "Terms.csv")
diff --git a/prereqs.R b/prereqs.R
new file mode 100644
index 0000000..49afc0f
--- /dev/null
+++ b/prereqs.R
@@ -0,0 +1,7 @@
+install.packages('twitteR')
+install.packages("tm")
+install.packages("wordcloud")
+install.packages("slam")
+install.packages("topicmodels")
+install.packages('base64enc')
+install.packages('readr')
diff --git a/topic-exploration.R b/topic-exploration.R
new file mode 100644
index 0000000..c73e32a
--- /dev/null
+++ b/topic-exploration.R
@@ -0,0 +1,31 @@
+#WIP - further exploration of the data from "Topic Modelling.R"
+
+#Visualisation 1 - Frequency of terms
+library(ggplot2)
+term.freq <- colSums(as.matrix(tdm)) # columns of the document-term matrix are the terms
+term.freq <- subset(term.freq, term.freq >= 5)
+df <- data.frame(term = names(term.freq), freq = term.freq)
+ggplot(df, aes(x=term, y=freq)) + geom_bar(stat = "identity") + xlab("Terms") + ylab("Count") +
+  coord_flip()
+
+#Visualisation 2 - explore associations of the terms found
+findAssocs(tdm, "delta", 0.25)
+findAssocs(tdm, "jfk", 0.25) # terms were lower-cased during cleaning
+findAssocs(tdm, "delay", 0.25)
+
+#Visualisation 3 - Create a WordCloud
+library(wordcloud)
+m <- as.matrix(tdm)
+# calculate the frequency of words and sort it by frequency
+word.freq <- sort(colSums(m), decreasing = T)
+wordcloud(words = names(word.freq), freq = word.freq, min.freq = 3,
+          random.order = F)
+
+#Visualisation 4 - Cluster tree diagram
+# remove sparse terms
+tdm2 <- removeSparseTerms(tdm, sparse = 0.97)
+m2 <- as.matrix(tdm2)
+# cluster terms (transpose so that rows are terms, not documents)
+distMatrix <- dist(scale(t(m2)))
+fit <- hclust(distMatrix, method = "ward.D")
+plot(fit)
+rect.hclust(fit, k = 6) # cut tree into 6 clusters
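+
+#Visualisation 5 (a sketch, not part of the original scripts) - number of tweets per topic
+# assumes the `lda` model fitted in "Topic Modelling.R" is still in the workspace
+topicPerTweet <- topics(lda, 1) # most likely topic for each tweet
+barplot(table(topicPerTweet), main = "Tweets per topic",
+        xlab = "Topic", ylab = "Number of tweets")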