在用 Spark Streaming 和 jieba 一起實作中文詞頻統計時,用以下代碼出現了找不到 jieba 模塊的錯誤。可是我明明已經 import 成功了,而且在不使用 Spark Streaming 的情況下 jieba 是能用的。有大佬能幫幫忙嗎?作業今晚就要交了(淚)
from pyspark.context import SparkContext
import jieba
# from pyspark.sql.session import SparkSession
# from pyspark.ml import Pipeline
# from pyspark.ml.feature import StringIndexer, VectorIndexer
# Chinese word-frequency count: tokenize every input line with jieba, drop
# stop words, count occurrences, and print the 100 most frequent tokens.
#
# NOTE(review): the lambdas below call jieba inside the Spark Python worker
# processes. If they fail with "No module named jieba" even though the import
# at the top of the file succeeds, the workers are presumably started with a
# different Python interpreter than the driver -- check that PYSPARK_PYTHON
# points at the interpreter where jieba is installed. TODO confirm.
sc = SparkContext("local", "WordCount")  # local master, application name "WordCount"
data = sc.textFile(r"D:\WordCount.txt")  # input file is expected to be UTF-8 encoded

# Load the stop-word file (one entry per line) into a set so that the
# per-token membership test in the filter step is O(1) instead of a list scan.
with open(r'd:\中文停用詞庫.txt', 'r', encoding='utf-8') as f:
    stop = {line.replace('\n', '') for line in f}

# Extra punctuation and noise tokens to discard in addition to the file above.
stop.update([',','的','我','他','','。',' ','\n','?',';',':','-','(',')','!','1909','1920','325','B612','II','III','IV','V','VI','—','‘','’','“','”','…','、'])

# Parenthesized method chain: no backslash continuation needed.
data = (
    data.flatMap(lambda line: jieba.cut(line, cut_all=False))  # line -> tokens
    .filter(lambda w: w not in stop)                           # drop stop words
    .map(lambda w: (w, 1))                                     # token -> (token, 1)
    .reduceByKey(lambda w0, w1: w0 + w1)                       # sum counts per token
    .sortBy(lambda x: x[1], ascending=False)                   # most frequent first
)
print(data.take(100))
轉載請註明出處,本文鏈接:https://www.uj5u.com/shujuku/131071.html
標籤:疑難問題
上一篇:陣列交并差
