Add in some more help to get started #2

Open: wants to merge 21 commits into master
File renamed without changes.
16 changes: 0 additions & 16 deletions Crawl twitter

This file was deleted.

File renamed without changes.
41 changes: 41 additions & 0 deletions README.md
@@ -2,3 +2,44 @@ Text-Classification---R
=======================

Classifying documents into categories

[Document Classification using R](http://www.dataperspective.info/2013/07/document-classification-using-r.html)

The file "Topic Modelling" is based on the blog post [Topic Modeling in R](http://www.bigdatanews.datasciencecentral.com/profiles/blogs/topic-modeling-in-r).

Before running this on Windows I had some setup to do:

* Install R from https://cran.r-project.org/bin/windows/base/
* Install RStudio from https://www.rstudio.com/products/rstudio/download/
* Create a Twitter App to collect the data at https://apps.twitter.com/app/new - more details [Twitter Analytics Using R Part 1: Extract Tweets](https://www.credera.com/blog/business-intelligence/twitter-analytics-using-r-part-1-extract-tweets/)

The values from the Twitter app need to go into these lines in "Topic Modelling.R":

```R
Consumer_key<- "YOUR_CONSUMER_KEY"
Consumer_secret <- "YOUR_CONSUMER_SECRET"
access_token <- "YOUR_ACCESS_TOKEN"
access_token_secret <- "YOUR_TOKEN_SECRET"
```
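
These four values are passed straight to `setup_twitter_oauth()` near the top of the script, so once they are filled in the connection step looks like this:

```R
# authenticate with Twitter using the app credentials above
setup_twitter_oauth(Consumer_key, Consumer_secret, access_token, access_token_secret)
```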

Then in the console make sure all the required packages are installed (they are listed in the script "prereqs.R"):

```R
install.packages('twitteR')
install.packages("tm")
install.packages("wordcloud")
install.packages("slam")
install.packages("topicmodels")
install.packages('base64enc')
install.packages('readr')
```
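
If you would rather not reinstall packages you already have, a small convenience snippet like this (my own sketch, not part of the repo) does the same job:

```R
# install only the packages that are missing
pkgs <- c("twitteR", "tm", "wordcloud", "slam", "topicmodels", "base64enc", "readr")
missing <- setdiff(pkgs, rownames(installed.packages()))
if (length(missing) > 0) install.packages(missing)
```
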
Now you are ready to run "Topic Modelling.R" in RStudio! There are two lines near the top of the file that can be used to tweak the number and type of results:

```R
numTweets <- 900

tweetData <- searchTwitter("flight", n=numTweets, lang="en")
```

numTweets is used in the Twitter search and later to set the SEED for the analysis algorithm.
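
The SEED line itself is not shown here, so treat this as a sketch of how it could be wired up rather than the exact code (topicmodels accepts a seed through `control = list(seed = ...)`):

```R
SEED <- numTweets                                     # hypothetical: reuse numTweets as the seed
lda <- LDA(tdm, k = 8, control = list(seed = SEED))   # reproducible topic model
```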

tweetData holds the data that we want to analyse. Here I have done a simple search for the text "flight" in English. See [Getting Data via twitteR](https://sites.google.com/site/miningtwitter/basics/getting-data/by-twitter) for more ideas. Look in "topic-exploration.R" and "Sentiment.R" for more ways to look at the data downloaded in "Topic Modelling.R".
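
For example, `searchTwitter()` also takes date, location and result-type filters; the values below are purely illustrative:

```R
# narrow the search by date, location (lat,long,radius) and recency
tweetData <- searchTwitter("flight", n = numTweets, lang = "en",
                           since = "2016-01-01", until = "2016-01-31",
                           geocode = "51.47,-0.45,25mi", resultType = "recent")
```
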
81 changes: 81 additions & 0 deletions Sentiment.R
@@ -0,0 +1,81 @@
#load required libraries
library(plyr)
library(stringr)
library(readr)
# download https://github.com/uwescience/datasci_course_materials/blob/master/assignment1/AFINN-111.txt
# and put it in your working directory; it contains the word list used for the sentiment scoring below
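# Optional sketch: fetch AFINN-111.txt from R instead of downloading it by hand.
# (The raw.githubusercontent.com URL below is my guess at the raw path for the file
# linked above, so check that it resolves before relying on it.)
if (!file.exists("AFINN-111.txt")) {
  download.file(paste0("https://raw.githubusercontent.com/uwescience/",
                       "datasci_course_materials/master/assignment1/AFINN-111.txt"),
                destfile = "AFINN-111.txt")
}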

# This is a follow on from "Topic Modelling.R" and takes a different look at the data

#get the tweets
dataset <- read_csv("tweets.csv")
tweets_txt <- dataset$text


#function to clean data
cleanTweets = function(tweets)
{
  tweets_cl = gsub("(RT|via)((?:\\b\\W*@\\w+)+)","",tweets)
  tweets_cl = gsub("http[^[:blank:]]+", "", tweets_cl)
  tweets_cl = gsub("@\\w+", "", tweets_cl)
  tweets_cl = gsub("[ \t]{2,}", "", tweets_cl)
  tweets_cl = gsub("^\\s+|\\s+$", "", tweets_cl)
  tweets_cl = gsub("[[:punct:]]", " ", tweets_cl)
  tweets_cl = gsub("[^[:alnum:]]", " ", tweets_cl)
  tweets_cl <- gsub('\\d+', '', tweets_cl)
  return(tweets_cl)
}

#function to calculate number of words in each category within a sentence
sentimentScore <- function(sentences, vNegTerms, negTerms, posTerms, vPosTerms){
  final_scores <- matrix('', 0, 5)
  scores <- laply(sentences, function(sentence, vNegTerms, negTerms, posTerms, vPosTerms){
    initial_sentence <- sentence
    #remove unnecessary characters and split up by word
    sentence = cleanTweets(sentence)
    sentence <- tolower(sentence)
    wordList <- str_split(sentence, '\\s+')
    words <- unlist(wordList)
    #build vector with matches between sentence and each category
    vPosMatches <- match(words, vPosTerms)
    posMatches <- match(words, posTerms)
    vNegMatches <- match(words, vNegTerms)
    negMatches <- match(words, negTerms)
    #sum up number of words in each category
    vPosMatches <- sum(!is.na(vPosMatches))
    posMatches <- sum(!is.na(posMatches))
    vNegMatches <- sum(!is.na(vNegMatches))
    negMatches <- sum(!is.na(negMatches))
    score <- c(vNegMatches, negMatches, posMatches, vPosMatches)
    #add row to scores table
    newrow <- c(initial_sentence, score)
    final_scores <- rbind(final_scores, newrow)
    return(final_scores)
  }, vNegTerms, negTerms, posTerms, vPosTerms)
  return(scores)
}


#load pos,neg statements
afinn_list <- read.delim(file='AFINN-111.txt', header=FALSE, stringsAsFactors=FALSE)
names(afinn_list) <- c('word', 'score')
afinn_list$word <- tolower(afinn_list$word)

#categorize words as very negative to very positive using the AFINN scores
vNegTerms <- afinn_list$word[afinn_list$score==-5 | afinn_list$score==-4]
negTerms <- afinn_list$word[afinn_list$score==-3 | afinn_list$score==-2 | afinn_list$score==-1]
posTerms <- afinn_list$word[afinn_list$score==3 | afinn_list$score==2 | afinn_list$score==1]
vPosTerms <- afinn_list$word[afinn_list$score==5 | afinn_list$score==4]

#Calculate score on each tweet
tweetResult <- as.data.frame(sentimentScore(tweets_txt, vNegTerms, negTerms, posTerms, vPosTerms))
tweetResult$'2' = as.numeric(tweetResult$'2')
tweetResult$'3' = as.numeric(tweetResult$'3')
tweetResult$'4' = as.numeric(tweetResult$'4')
tweetResult$'5' = as.numeric(tweetResult$'5')
counts = c(sum(tweetResult$'2'),sum(tweetResult$'3'),sum(tweetResult$'4'),sum(tweetResult$'5'))
names = c("Worst","BAD","GOOD","VERY GOOD")
mr = list(counts,names)
colors = c("red", "yellow", "green", "violet")
barplot(mr[[1]], main="Simple Twitter Sentiment Analysis", xlab="Number of votes",legend=mr[[2]],col=colors)

47 changes: 0 additions & 47 deletions Topic Modelling

This file was deleted.

58 changes: 58 additions & 0 deletions Topic Modelling.R
@@ -0,0 +1,58 @@
library("twitteR")
library("tm")
library("wordcloud")
library("slam")
library("topicmodels")
library("readr")

#Connect to Twitter
# For guidance on creating an app see
# https://www.credera.com/blog/business-intelligence/twitter-analytics-using-r-part-1-extract-tweets/
Consumer_key<- "YOUR_CONSUMER_KEY"
Consumer_secret <- "YOUR_CONSUMER_SECRET"
access_token <- "YOUR_ACCESS_TOKEN"
access_token_secret <- "YOUR_TOKEN_SECRET"
setup_twitter_oauth(Consumer_key,Consumer_secret,access_token,access_token_secret)

numTweets <- 900

#get data set and save for later, e.g. to investigate anomalies
tweetData <- searchTwitter("flight", n=numTweets, lang="en")
write.csv(twListToDF(tweetData), file="tweets.csv")

#Load Text
dataset <- read_csv("tweets.csv")
tweets <- dataset$text
#

#Clean Text
tweets = gsub("(RT|via)((?:\\b\\W*@\\w+)+)","",tweets)
tweets = gsub("http[^[:blank:]]+", "", tweets)
tweets = gsub("@\\w+", "", tweets)
tweets = gsub("[ \t]{2,}", "", tweets)
tweets = gsub("^\\s+|\\s+$", "", tweets)
tweets <- gsub('\\d+', '', tweets)
tweets = gsub("[[:punct:]]", " ", tweets)

corpus = Corpus(VectorSource(tweets))
corpus = tm_map(corpus,removePunctuation)
corpus = tm_map(corpus,stripWhitespace)
corpus = tm_map(corpus, content_transformer(tolower)) # newer versions of tm need base functions wrapped in content_transformer
corpus = tm_map(corpus,removeWords,stopwords("english"))
#remove the Twitter related metadata
corpus = tm_map(corpus,removeWords, c("iphone","android","web", "tweetdeck", "ifttt"))
tdm = DocumentTermMatrix(corpus) # Creating a document-term matrix (documents as rows, terms as columns)

# create tf-idf matrix
# see http://tidytextmining.com/tfidf.html for the theory
term_tfidf <- tapply(tdm$v/row_sums(tdm)[tdm$i], tdm$j, mean) * log2(nDocs(tdm)/col_sums(tdm > 0))
summary(term_tfidf)
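# Keep only terms whose mean tf-idf clears the threshold; low-scoring terms occur in
# most tweets and add little to the topics. A common rule of thumb is to put the cut-off
# near the median reported by summary() above, so adjust 0.1 after looking at that output.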
tdm <- tdm[,term_tfidf >= 0.1]
tdm <- tdm[row_sums(tdm) > 0,]

#Perform the topic modelling and save summary
#Further info on LDA here http://blog.echen.me/2011/08/22/introduction-to-latent-dirichlet-allocation/
lda <- LDA(tdm, k = 8) # find 8 topics
Terms <- terms(lda, 10) # first 10 terms of every topic
Terms
write.csv(Terms, file = "Terms.csv")
7 changes: 7 additions & 0 deletions prereqs.R
@@ -0,0 +1,7 @@
install.packages('twitteR')
install.packages("tm")
install.packages("wordcloud")
install.packages("slam")
install.packages("topicmodels")
install.packages('base64enc')
install.packages('readr')
31 changes: 31 additions & 0 deletions topic-exploration.R
@@ -0,0 +1,31 @@
#WIP with further exploration of data from "Topic Modelling"

#Visualisation 1 - Frequency of terms
library(ggplot2)
term.freq <- colSums(as.matrix(tdm)) # tdm has documents as rows and terms as columns, so term counts are the column sums
term.freq <- subset(term.freq, term.freq >=5)
df <- data.frame(term = names(term.freq), freq = term.freq)
ggplot(df, aes(x=term, y=freq)) + geom_bar(stat = "identity") + xlab("Terms") + ylab("Count") +coord_flip()

#Visualisation 2 - explore associations of terms found
findAssocs(tdm, "delta", 0.25)
findAssocs(tdm, "JFK", 0.25)
findAssocs(tdm, "delay", 0.25)

#Visualisation 3 - Create a word cloud
library(wordcloud)
m <- t(as.matrix(tdm)) # transpose so that terms are rows
# calculate the frequency of words and sort it by frequency
word.freq <- sort(rowSums(m), decreasing = T)
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 3,
random.order = F)

#Visualisation 4 - Cluster tree diagram
# remove sparse terms
tdm2 <- removeSparseTerms(tdm, sparse = 0.97)
m2 <- t(as.matrix(tdm2)) # transpose so that terms, not documents, are clustered
# cluster terms
distMatrix <- dist(scale(m2))
fit <- hclust(distMatrix, method = "ward.D")
plot(fit)
rect.hclust(fit, k = 6) # cut tree into 6 clusters