`OneHotEncoder` does not accept an empty string as a category name; if the column contains one, you get the following error:
java.lang.IllegalArgumentException: requirement failed: Cannot have an empty string for name. at scala.Predef$.require(Predef.scala:233) at org.apache.spark.ml.attribute.Attribute$$anonfun$5.apply(attributes.scala:33) at org.apache.spark.ml.attribute.Attribute$$anonfun$5.apply(attributes.scala:32) [...]
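Here is one way to work around it (there is another way; see @Anthony's answer).
First, define a UDF that replaces the empty category: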
import org.apache.spark.sql.functions._
/** Spark UDF that replaces a missing category label with the placeholder "NA".
  *
  * A label is treated as missing when it is the empty string or null; every
  * other value is returned unchanged.
  */
def processMissingCategory = udf[String, String] { s =>
  // Option(s) turns a null label into None, so null and "" share one code path.
  Option(s).filter(_.nonEmpty).getOrElse("NA")
}
Then apply the UDF to the column:
// Sample data: row 3 has an empty category, which processMissingCategory
// rewrites to "NA". The column is referenced with col(...) from
// org.apache.spark.sql.functions._, so no implicit Symbol-to-Column
// conversion has to be in scope.
val df = sqlContext
  .createDataFrame(Seq(
    (0, "a"),
    (1, "b"),
    (2, "c"),
    (3, ""), // <- original example has "a" here
    (4, "a"),
    (5, "c")
  ))
  .toDF("id", "category")
  .withColumn("category", processMissingCategory(col("category")))

df.show()
// +---+--------+
// | id|category|
// +---+--------+
// |  0|       a|
// |  1|       b|
// |  2|       c|
// |  3|      NA|
// |  4|       a|
// |  5|       c|
// +---+--------+
// Fit a StringIndexer on the cleaned "category" column and write the result
// to "categoryIndex".
val indexer = new StringIndexer()
  .setInputCol("category")
  .setOutputCol("categoryIndex")
  .fit(df)
val indexed = indexer.transform(df)
indexed.show()

// Encode "categoryIndex" into the "categoryVec" column with OneHotEncoder.
val encoder = new OneHotEncoder()
  .setInputCol("categoryIndex")
  .setOutputCol("categoryVec")
val encoded = encoder.transform(indexed)
encoded.show()
EDIT:
@Anthony's solution, written in Scala:
// Alternative without a UDF: replace the empty label in "category" with "NA"
// through the DataFrame's na functions.
val emptyToNa = Map("" -> "NA")
df.na.replace("category", emptyToNa).show()
I hope this helps!