可以试试下面的方法:先把 schools 中每个结构体的校名提取成一个字符串数组,再拼接为逗号分隔的字符串,最后用正则表达式过滤出包含以 "b" 开头的学校的行。
// Build a sample DataFrame: each row holds an array of cities, a name,
// and an array of (school, year) tuples (stored as array<struct<_1,_2>>).
// NOTE(review): Array(("santa cruz")) is just Array("santa cruz") — the inner
// parentheses are redundant, the element is a plain String, not a tuple.
scala> val df = Seq ( ( Array("palo alto", "menlo park"), "Michael", Array(("stanford", 2010), ("berkeley", 2012))),
| (Array(("santa cruz")),"Andy",Array(("ucsb", 2011))),
| (Array(("portland")),"Justin",Array(("berkeley", 2014)))
| ).toDF("cities","name","schools")
df: org.apache.spark.sql.DataFrame = [cities: array<string>, name: string ... 1 more field]
// "schools._1" projects field _1 (the school name) out of every struct in the
// array, yielding a new array<string> column `sch1`.
scala> val df2 = df.select ("*").withColumn("sch1",df("schools._1"))
df2: org.apache.spark.sql.DataFrame = [cities: array<string>, name: string ... 2 more fields]
// concat_ws joins the array of school names into one comma-separated string
// (`sch2`), so an ordinary string regex can be applied to it.
scala> val df3=df2.select("*").withColumn("sch2",concat_ws(",",df2("sch1")))
df3: org.apache.spark.sql.DataFrame = [cities: array<string>, name: string ... 3 more fields]
// Keep rows where any school name starts with 'b': the pattern matches a 'b'
// either at the start of the string (^b) or right after a separator comma (,b).
scala> df3.select("*").where( df3("sch2") rlike "^b|,b" ).show(false)
+-----------------------+-------+------------------------------------+--------------------+-----------------+
|cities |name |schools |sch1 |sch2 |
+-----------------------+-------+------------------------------------+--------------------+-----------------+
|[palo alto, menlo park]|Michael|[[stanford, 2010], [berkeley, 2012]]|[stanford, berkeley]|stanford,berkeley|
|[portland] |Justin |[[berkeley, 2014]] |[berkeley] |berkeley |
+-----------------------+-------+------------------------------------+--------------------+-----------------+
在随后的步骤中,您可以用 drop 删除不再需要的中间列(sch1 和 sch2)。