R 基于樸素貝葉斯模型實作手機垃圾短信過濾-有解無憂

# 讀取數資料， 查看資料結構
df_raw <- read.csv("sms_spam.csv", stringsAsFactors=F)
str(df_raw)
length(df_raw$type)

# 將資料分為特征值矩陣 X 和 類標向量y 兩部分，將 y 換為因子
X <- df_raw$text
y <- factor(df_raw$type)
length(y)

# 查看類標向量 y 的結構和組成
str(y)
table(y)

# 安裝和加載文本挖掘包
#install.packages("tm")
library(NLP)
library(tm)

# 創建語料庫
X_corpus <- VCorpus(VectorSource(X))


######## 1 清洗文本資料 ########
# 1.1 將文本中的字母轉換為小寫
X_corpus_clean <- tm_map(X_corpus, content_transformer(tolower))    

# 1.2 去除文本中的數字
X_corpus_clean <- tm_map(X_corpus_clean, removeNumbers)

# 1.3 去除文本中的停用詞
X_corpus_clean <- tm_map(X_corpus_clean, removeWords, stopwords())

# 1.4 去除文本中的標點符號
X_corpus_clean <- tm_map(X_corpus_clean, removePunctuation)

# 添加包
#install.packages("SnowballC")
library(SnowballC)
# 1.5 提取文本中每個單詞的詞干
X_corpus_clean <- tm_map(X_corpus_clean, stemDocument)

# 1.6 洗掉額外的空白
X_corpus_clean <- tm_map(X_corpus_clean, stripWhitespace)

# 1.7 將文本檔案拆分成詞語, 創建檔案——單詞矩陣 
X_dtm <- DocumentTermMatrix(X_corpus_clean)


############# 2 準備輸入資料 #############

# 2.1 劃分訓練資料集和測驗資料集
X_dtm_train <- X_dtm[1:4169, ]
X_dtm_test <- X_dtm[4170:5559, ]

y_train <- y[1:4169]
y_test <- y[4170:5559]
# 說明：因為原始資料 df_raw 是隨機選取的，所以可以直直接去前 75% 的資料為測驗資料

# 2.2 檢查樣本分分布是否偏斜
prop.table(table(y_train))
prop.table(table(y_test))

# 2.3 過濾 DTM， 選取頻繁出現的單詞
X_freq_words <- findFreqTerms(X_dtm_train, 5)   # 此處可以試錯調整，以調節模型的性能
# 過濾 DTM
X_dtm_train_freq <- X_dtm_train[, X_freq_words]
X_dtm_test_freq <- X_dtm_test[, X_freq_words]

# 2.4 將矩陣文本編碼為數值
# 2.4.1定義一個變數轉換函式
convert_counts <- function(x) {
  x <- ifelse(x > 0, "Yes", "No")
}

# 2.4.2 轉換訓練矩陣和測驗矩陣
X_train <- apply(X_dtm_train_freq, MARGIN=2, convert_counts)
X_test <- apply(X_dtm_test_freq, MARGIN=2, convert_counts)



#############  3 基于資料訓練模型 ############
# install.packages("e1071")
 library(e1071)

# 訓練模型, 拉普拉斯估計引數默認為 0
NB_classifier <- naiveBayes(X_train, y_train)


##############  4 評估模型的性能 ############# 

# 4.1 對測驗集中的樣本進行預測
y_pred <- predict(NB_classifier, X_test)

# 比較預測值和真實值
# library(gmodels)
CrossTable(x=y_test, y=y_pred,
           prop.chisq = F, prop.t = F, prop.c = F,
           dnn = c("actural", "predict"))

模型 NB_classifier 在測驗集上進行預測的混淆矩陣為：

準確率 = 0.864 + 0.110 = 0.974

對模型調參

##################  5 提高模型的性能  ##################
 
# 5.1 添加拉普拉斯估計，訓練模型
NB_classifier2 <- naiveBayes(x = X_train, y = y_train, laplace = 1)

# 5.2 對測驗集中的樣本進行預測
y_pred2 <- predict(NB_classifier2, X_test)

# 5.3 比較預測值和真實值
CrossTable(x = y_test, y = y_pred2,
           prop.chisq = F, prop.t = T, prop.c = F,
           dnn = c("actural", "predict"))

經過引數調優后的模型 NB_classifier2 在測驗集上進行預測的混淆矩陣為：

準確率 = 0.865 + 0.112 =0.977

按語：

經過拉普拉斯估計引數的調節，模型準確率有 0.974 提高至 0.977，在高準確的前提下能有提升，實屬不易，

轉載請註明出處，本文鏈接：https://www.uj5u.com/qita/121146.html

標籤：其他

上一篇：5-3 自然數的拆分 (20 分)

下一篇：CodeForces - 1228C（質因數分解+貢獻法）