Spark 多個特徵做 one-hot:我有 50 多個特徵需要做 one-hot 處理,怎麼做效率高一點?
uj5u.com熱心網友回復:
// Brings the implicit conversions needed by `.toDF` into scope.
// NOTE(review): `sc` must be a value that exposes `implicits` (e.g. a
// SQLContext/SparkSession); a plain SparkContext does not — confirm.
import sc.implicits._

// Build the input DataFrame from `dataRDD`. Each element `x` is a pair:
// `x._1` is fed to `enum2Double` to obtain the value of the "loss" column and
// `x._2` is an indexable collection whose first four entries become the
// "gender", "age", "grade" and "region" columns.
val vectorData = dataRDD
  // Convert the enumerated value to a Double.
  .map(x => (enum2Double("是否已流失", x._1), x._2(0), x._2(1), x._2(2), x._2(3)))
  // Name the columns; "loss" is later used as the LabeledPoint label.
  .toDF("loss", "gender", "age", "grade", "region")
// Indexing columns: one StringIndexer per categorical column, writing its
// result to "<column>_index".
val stringColumns = Array("gender", "age", "grade", "region")
val index_transformers: Array[org.apache.spark.ml.PipelineStage] = stringColumns.map(
  cname => new StringIndexer()
    .setInputCol(cname)
    .setOutputCol(s"${cname}_index")
)

// Index-only pipeline. These values are no longer needed to discover the
// indexed column names (see `indexColumns`), so they are lazy: the names stay
// available to any later code, but the extra fit/transform pass over
// `vectorData` only runs if something actually uses them.
lazy val index_pipeline = new Pipeline().setStages(index_transformers)
lazy val index_model = index_pipeline.fit(vectorData)
lazy val df_indexed = index_model.transform(vectorData)

// Encoding columns: the indexed column names are known up front from the
// StringIndexer output naming above, so derive them directly instead of
// fitting a pipeline and filtering its output columns by the substring "index"
// (which would also pick up any unrelated column whose name contains "index").
val indexColumns = stringColumns.map(cname => s"${cname}_index")
val one_hot_encoders: Array[org.apache.spark.ml.PipelineStage] = indexColumns.map(
  cname => new OneHotEncoder()
    .setInputCol(cname)
    .setOutputCol(s"${cname}_vec")
)

// Single pipeline: all indexers followed by all encoders, fitted once.
// Add the rest of your pipeline (e.g. VectorAssembler and the algorithm) here.
val pipeline = new Pipeline().setStages(index_transformers ++ one_hot_encoders)
val model = pipeline.fit(vectorData)
// Apply the fitted pipeline, keep the label plus the one-hot encoded vectors,
// and turn every row into an ml.feature.LabeledPoint whose features are the
// encoded vectors concatenated in select order.
model.transform(vectorData)
  .select("loss", "gender_index_vec", "age_index_vec", "grade_index_vec", "region_index_vec")
  .map { row =>
    // Column 0 is the label; it is converted via its string form, exactly as
    // before, because the concrete type produced by `enum2Double` is not
    // visible here.
    val label = row.apply(0).toString().toDouble
    // Columns 1..n are the encoded vectors. Reading them by position through
    // the general ml.linalg.Vector interface avoids repeating one hard-coded
    // cast per column (this scales to any number of selected features) and
    // does not depend on the encoder emitting a specific vector subclass.
    val parts = (1 until row.length).map(i => row.getAs[ml.linalg.Vector](i).toArray)
    ml.feature.LabeledPoint(label, ml.linalg.Vectors.dense(Array.concat(parts: _*)))
  }
來源:
http://blog.csdn.net/pan_haufei/article/details/72903667
祝成功
轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/58021.html
標籤:Spark
上一篇:求教虛擬機動態遷移如何截獲停機點(downtime)
下一篇:linux小白搭建yum倉庫報錯
