My First Data Science Project with R

“My First Data Science Project with R”

Sharing by Chainarong Kesamoon, Data Science Team

meetup_R

  1. โหลด dataset
## Load dataset
# The path is written with plain spaces: a backslash-escaped space ("\ ") is
# shell syntax and is rejected by the R parser as an unrecognized escape.
# The working directory is set because the data files in this script are
# read with relative paths.
setwd("/Users/day/Desktop/Data Science Thailand")
amazon_cafe <- read.csv("amazon_cafe.csv", stringsAsFactors = FALSE,
                        header = TRUE)
# Keep only the text column; from here on amazon_cafe is a character vector.
amazon_cafe <- amazon_cafe$amazon_cafe_text
  1. รวมข้อความทั้งหมดเข้าด้วยกัน แล้วใช้ฟังก์ชัน gsub() เพื่อลบเครื่องหมายต่าง ๆ ออก
# Combine all texts into one long string, separated by single spaces.
amazon_cafe <- paste(amazon_cafe, collapse = " ")

# Punctuation that is followed by a space is replaced by ONE space, not by
# the empty string. Deleting the space as well would glue the neighbouring
# words together ("cafe. wells" -> "cafewells"), and, because the texts were
# just joined with " ", it would also glue the last word of one text to the
# first word of the next one.
amazon_cafe <- gsub("[.]+[ ]", " ", amazon_cafe)  # ". " -> " "

# These marks are simply dropped wherever they occur.
amazon_cafe <- gsub("[:]", "", amazon_cafe)       # remove ":"
amazon_cafe <- gsub("[!]", "", amazon_cafe)       # remove "!"
amazon_cafe <- gsub("[?]", "", amazon_cafe)       # remove "?"
amazon_cafe <- gsub("[;]", "", amazon_cafe)       # remove ";"

amazon_cafe <- gsub("[,]+[ ]", " ", amazon_cafe)  # ", " -> " "
  1. โหลด library tm เพื่อใช้เครื่องมือช่วยในการนับคำ
library(tm)

## Set up the source and the corpus
# The cleaned text is wrapped in a vector source, from which the corpus is
# created.
amazonCafeAppSource <- VectorSource(amazon_cafe)
corpusCafe <- Corpus(amazonCafeAppSource)

## Build the document-term matrix
# The document-term matrix is converted straight to a plain matrix so that
# colnames() / colSums() can be applied to it afterwards.
dtmCafe <- as.matrix(DocumentTermMatrix(corpusCafe))
  1. library dplyr ใช้ช่วยในการจัดการตารางข้อมูล
library(dplyr)

# Build the word-probability table for class "cafe":
#   word            - term taken from the column names of dtmCafe
#   count           - column sum of dtmCafe for that term
#   plusOne         - count + 1 (add-one smoothing)
#   probability     - plusOne divided by the total of plusOne
#   log.Probability - natural log of probability
# Only words longer than 3 characters are kept, and rows are sorted by
# descending count.
# as_tibble() replaces the deprecated tbl_df().
table_cafe <- data.frame(word = colnames(dtmCafe), count = colSums(dtmCafe)) %>%
  as_tibble() %>%
  filter(nchar(as.character(word)) > 3) %>%
  mutate(
    plusOne = count + 1,
    probability = plusOne / sum(plusOne),
    log.Probability = log(probability)
  ) %>%
  arrange(desc(count))

ซึ่งจะทำให้ได้ตารางความน่าจะเป็นของคลาส cafe ดังนี้

# Print the word-probability table for class "cafe"; the printed output follows.
table_cafe
## Source: local data frame [564 x 5]
## 
##                     word count plusOne probability log.Probability
##                   (fctr) (dbl)   (dbl)       (dbl)           (dbl)
## 1                  wells    66      67  0.02853492       -3.556627
## 2                emerald    62      63  0.02683135       -3.618184
## 3                   cafe    43      44  0.01873935       -3.977130
## 4                   &amp    40      41  0.01746167       -4.047747
## 5                 amazon    40      41  0.01746167       -4.047747
## 6                blossom    33      34  0.01448041       -4.234959
## 7                   pear    33      34  0.01448041       -4.234959
## 8            @mckbirdbks    32      33  0.01405451       -4.264812
## 9                   lane    32      33  0.01405451       -4.264812
## 10 http//t.co/0t3myyiwbj    26      27  0.01149915       -4.465482
## ..                   ...   ...     ...         ...             ...

สำหรับตารางความน่าจะเป็นของคลาส other ก็ทำได้ในลักษณะเดียวกัน ตั้งชื่อว่า table_other จะมีลักษณะดังนี้

# Print the word-probability table for class "other"; the printed output follows.
table_other
## Source: local data frame [1,257 x 5]
## 
##        word count plusOne probability log.Probability
##      (fctr) (dbl)   (dbl)       (dbl)           (dbl)
## 1    amazon    45      46 0.013710879       -4.289566
## 2    #asmsg    29      30 0.008941878       -4.717010
## 3      &amp    28      29 0.008643815       -4.750911
## 4      from    25      26 0.007749627       -4.860111
## 5     plane    21      22 0.006557377       -5.027165
## 6   mission    20      21 0.006259314       -5.073685
## 7  prepares    20      21 0.006259314       -5.073685
## 8     solar    20      21 0.006259314       -5.073685
## 9     #pdf1    15      16 0.004769001       -5.345618
## 10   #iartg    14      15 0.004470939       -5.410157
## ..      ...   ...     ...         ...             ...
  1. โหลดข้อมูลสำหรับทดสอบ Naive Bayes Prediction
## load test set
# The file is read with "\n" as the field separator and without a header,
# so the texts end up in the single column V1.
amazon_test <- read.csv("amazon_test.txt", stringsAsFactors = FALSE,
                        sep = "\n", header = FALSE)
## Warning in read.table(file = file, header = header, sep = sep,
## quote = quote, : incomplete final line found by readTableHeader on
## 'amazon_test.txt'
amazon_test <- amazon_test$V1
## prepare prediction vector
m <- length(amazon_test)
# The predictions are the strings "CAFE" / "OTHER", so the vector is
# preallocated as character rather than logical.
# NOTE(review): the name `predict` masks stats::predict() in this session.
predict <- character(m)

## Naive Bayes prediction for each instance

# Return the log-probability of every word in `words` according to
# `prob_table` (a table with columns word, count and log.Probability).
# A word that is not in the table gets log(1 / sum(count)).
# NOTE(review): the table probabilities use sum(plusOne) as denominator while
# this fallback uses sum(count); kept as in the original rule - confirm
# whether the fallback should use the smoothed total as well.
word_log_probs <- function(words, prob_table) {
  idx <- match(words, as.character(prob_table$word))
  log_probs <- prob_table$log.Probability[idx]
  log_probs[is.na(idx)] <- log(1 / sum(prob_table$count))
  log_probs
}

# seq_along() instead of 1:m so that an empty test set runs zero iterations.
for (i in seq_along(amazon_test)) {
  ## cleansing data at each tweet
  tweet <- tolower(amazon_test[i])
  # Periods followed by a space OR ending the tweet become a single space.
  # Replacing with "" would glue the neighbouring words together, and a
  # tweet-final "." would otherwise stay attached to the last word and stop
  # it from matching the class tables.
  tweet <- gsub("[.]+([ ]|$)", " ", tweet)
  tweet <- gsub("[:]", "", tweet)
  tweet <- gsub("[!]", "", tweet)
  tweet <- gsub("[?]", "", tweet)
  tweet <- gsub("[;]", "", tweet)
  tweet <- gsub("[,]", "", tweet)
  amazon_test[i] <- tweet

  ## split tweet as a vector of words
  test_vector <- strsplit(tweet, " ", fixed = TRUE)[[1]]
  # Ignore words of 3 characters or fewer: the class tables only contain
  # words longer than 3 characters, so shorter words could never match and
  # would only add the (class-dependent) unseen-word penalty.
  # Dropping a word is equivalent to giving it a log-probability of 0.
  # This also copes with a tweet that is empty after cleansing.
  test_vector <- test_vector[nchar(test_vector) > 3]

  ## conditional log-probability of each remaining word, per class
  p_cafe_vector <- word_log_probs(test_vector, table_cafe)
  p_other_vector <- word_log_probs(test_vector, table_other)

  ## sum up log-probability for each tweet
  p_cafe <- sum(p_cafe_vector)     # log-likelihood for class cafe
  p_other <- sum(p_other_vector)   # log-likelihood for class other
  # Ties go to "OTHER".
  predict[i] <- if (p_cafe > p_other) "CAFE" else "OTHER"
}
predict   # view the prediction
## [1] "OTHER" "CAFE"  "CAFE"  "CAFE"
  1. เราสามารถใช้ library wordcloud สำหรับการ visualization ได้
# visualization
library(wordcloud)
# Keep only words of at most 10 characters.
cloud <- filter(table_cafe, nchar(as.character(word)) <= 10)
# Take at most the first 100 rows. head() stops at the last available row,
# whereas indexing with [1:100] pads the vectors with NA when fewer than
# 100 words remain.
top_words <- head(cloud, 100)
wordcloud(as.character(top_words$word), top_words$count,
          colors = brewer.pal(5, "Spectral"))

1 Comment

  1. Kritsapat

    Reply

    รบกวนสอบถาม พอจะมีตัวอย่างในการทำ ประโยค ภาษาไทย ไหมครับ

Leave Comment

Your email address will not be published. Required fields are marked *