Such an operation is possible. Since df2 is small enough to be collected to the driver, you can use a udf:
import spark.implicits._
import org.apache.spark.sql.functions._
val df1 = Seq(
(Seq("k", "l"), 1), (Seq("m", "n"), 2), (Seq("o"), 3)
).toDF("key1", "value")
val df2 = Seq("k", "l", "m", "n", "o").toDF("key2")
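// collect the keys from df2 to the driver and seed each with a default count of 0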
val keys = df2.as[String].collect.map((_, 0)).toMap
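// for every row, start from the full key map and flip the matched keys to 1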
val toKeyMap = udf((xs: Seq[String]) =>
xs.foldLeft(keys)((acc, x) => acc + (x -> 1)))
df1.select(toKeyMap($"key1").alias("key3"), $"value").show(false)
// +-------------------------------------------+-----+
// |key3 |value|
// +-------------------------------------------+-----+
// |Map(n -> 0, m -> 0, l -> 1, k -> 1, o -> 0)|1 |
// |Map(n -> 1, m -> 1, l -> 0, k -> 0, o -> 0)|2 |
// |Map(n -> 0, m -> 0, l -> 0, k -> 0, o -> 1)|3 |
// +-------------------------------------------+-----+
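If you only care about one particular key, you can index into the resulting map column instead of keeping the whole map; a minimal sketch (the alias k_flag is just an example name):

df1.select(toKeyMap($"key1").getItem("k").alias("k_flag"), $"value").show(false)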
If you only need a single string:
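// same fold as above, then render the map as "key: value" pairs joined with spaces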
val toKeyMapString = udf((xs: Seq[String]) =>
xs.foldLeft(keys)((acc, x) => acc + (x -> 1))
.map { case (k, v) => s"$k: $v" }
.mkString(" ")
)
df1.select(toKeyMapString($"key1").alias("key3"), $"value").show(false)
// +------------------------+-----+
// |key3 |value|
// +------------------------+-----+
// |n: 0 m: 0 l: 1 k: 1 o: 0|1 |
// |n: 1 m: 1 l: 0 k: 0 o: 0|2 |
// |n: 0 m: 0 l: 0 k: 0 o: 1|3 |
// +------------------------+-----+
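Note that the entry order of a Scala immutable Map is not guaranteed, so the pair order in the string above may differ between runs or Scala versions. If you need deterministic output, sort the entries before joining them (toSortedKeyMapString is just an example name):

val toSortedKeyMapString = udf((xs: Seq[String]) =>
  xs.foldLeft(keys)((acc, x) => acc + (x -> 1))
    .toSeq.sortBy(_._1)                       // sort pairs alphabetically by key
    .map { case (k, v) => s"$k: $v" }
    .mkString(" ")
)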