com.databricks.spark.corenlp.functions is a Scala object, not a class, so we cannot instantiate it with invoke_new(); attempting to do so raises java.lang.Exception: com.databricks.spark.corenlp.functions.
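For reference, the failing call is nothing more than a constructor invocation. A minimal sketch, assuming sc is an active spark_connect() connection with spark-corenlp on its classpath:

# Fails: `functions` is a Scala object and has no public constructor
invoke_new(sc, "com.databricks.spark.corenlp.functions")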
Instead, we use invoke_static() to access the object's members directly:
invoke_static(sc, "com.databricks.spark.corenlp.functions", "cleanxml")
<jobj[15]>
  org.apache.spark.sql.expressions.UserDefinedFunction
  UserDefinedFunction(<function1>,StringType,Some(List(StringType)))
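Everything here assumes that sc was created with the spark-corenlp package (and the Stanford CoreNLP models) available on the classpath. The package coordinates below are an assumption and must match your Spark and Scala versions; a minimal sketch of such a connection:

library(sparklyr)

config <- spark_config()
# sparklyr.shell.packages is forwarded to spark-submit as --packages;
# the coordinates are an assumption and depend on your Spark/Scala build,
# and the Stanford CoreNLP models jar may also need to be added.
config[["sparklyr.shell.packages"]] <- "databricks:spark-corenlp:0.4.0-spark2.4-scala2.11"

sc <- spark_connect(master = "local", config = config)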
Following the example in the spark-corenlp README, let's copy a small data frame with some XML text to Spark:
df <- copy_to(sc, tibble(
  id = 1,
  text = "<xml>Stanford University is located in California. It is a great university.</xml>"
))
With the UDF handle in hand, we can write a helper that applies it to a column of a Spark data frame and returns the result as a new tbl:
sdf_cleanxml <- function(df, input_col, output_col) {
  sc <- df$src$con
  # Retrieve the cleanxml UDF from the Scala object
  clean_xml <- invoke_static(sc, "com.databricks.spark.corenlp.functions", "cleanxml")
  # Build a Column expression that applies the UDF to the input column
  arg <- list(invoke_static(sc, "org.apache.spark.sql.functions", "col", input_col))
  expr <- invoke(clean_xml, "apply", arg)
  df %>%
    spark_dataframe() %>%
    invoke("withColumn", output_col, expr) %>%
    sdf_register()
}
Applying it to our example data frame:
sdf_cleanxml(df, "text", "text_clean")
# Source: spark<?> [?? x 3]
id text text_clean
<dbl> <chr> <chr>
1 1 <xml>Stanford University is located… Stanford University is located in …
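Nothing in sdf_cleanxml() is specific to cleanxml; the same pattern works for any of the single-column functions on the object. A hypothetical generalization (the name sdf_corenlp is ours, not part of sparklyr or spark-corenlp):

sdf_corenlp <- function(df, fun, input_col, output_col) {
  sc <- df$src$con
  # Look up the requested UDF, e.g. "ssplit" or "sentiment", on the Scala object
  udf <- invoke_static(sc, "com.databricks.spark.corenlp.functions", fun)
  arg <- list(invoke_static(sc, "org.apache.spark.sql.functions", "col", input_col))
  expr <- invoke(udf, "apply", arg)
  df %>%
    spark_dataframe() %>%
    invoke("withColumn", output_col, expr) %>%
    sdf_register()
}

# For example, split the cleaned text into sentences
sdf_cleanxml(df, "text", "text_clean") %>%
  sdf_corenlp("ssplit", "text_clean", "sentences")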
To make every CoreNLP function available without going through invoke() each time, we can register them all as Spark SQL user-defined functions:
register_core_nlp <- function(sc) {
  funs <- c(
    "cleanxml", "tokenize", "ssplit", "pos", "lemma", "ner", "depparse",
    "coref", "natlog", "openie", "sentiment"
  )
  # Register each UDF with the Spark session so it can be called by name from SQL
  udf_registration <- sparklyr::invoke(sparklyr::spark_session(sc), "udf")
  for (fun in funs) {
    sparklyr::invoke(
      udf_registration, "register", fun,
      sparklyr::invoke_static(sc, "com.databricks.spark.corenlp.functions", fun)
    )
  }
}
register_core_nlp(sc)
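The functions are now part of the Spark session, so they can also be called from plain SQL, not only through dplyr. A quick check (assumes the DBI package; output not shown):

DBI::dbGetQuery(sc, "SELECT sentiment('It is a great university.') AS sentiment")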
Once registered, the functions can be used directly from dplyr verbs, since these are translated to Spark SQL:
df %>%
  transmute(doc = cleanxml(text)) %>%
  transmute(sen = explode(ssplit(doc))) %>%
  mutate(words = tokenize(sen), ner_tags = ner(sen), sentiment = sentiment(sen))
# Source: spark<?> [?? x 4]
sen words ner_tags sentiment
<chr> <list> <list> <int>
1 Stanford University is located in California . <list [7]> <list [7]> 1
2 It is a great university . <list [6]> <list [6]> 4
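The words and ner_tags columns arrive in R as list columns once collected. A sketch of one way to flatten them into one row per token, assuming the tidyr package is installed:

tokens <- df %>%
  transmute(sen = explode(ssplit(cleanxml(text)))) %>%
  mutate(words = tokenize(sen), ner_tags = ner(sen)) %>%
  collect()

# One row per token, with the matching NER tag alongside
tidyr::unnest(tokens, c(words, ner_tags))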