我有两个spark数据集,一个带有accountid和key列,key列的格式为数组[key1,key2,key3..]另一个数据集有两列accountid和键值,它们是json格式的。accountid,{key:value,key,value…}。如果在第一个数据集中accountid出现键,我需要更新第二个数据集中的值。
import org.apache.spark.sql.functions._
val df= sc.parallelize(Seq(("20180610114049", "id1","key1"),
("20180610114049", "id2","key2"),
("20180610114049", "id1","key1"),
("20180612114049", "id2","key1"),
("20180613114049", "id3","key2"),
("20180613114049", "id3","key3")
)).toDF("date","accountid", "key")
val gp=df.groupBy("accountid","date").agg(collect_list("key"))
+---------+--------------+-----------------+
|accountid| date|collect_list(key)|
+---------+--------------+-----------------+
| id2|20180610114049| [key2]|
| id1|20180610114049| [key1, key1]|
| id3|20180613114049| [key2, key3]|
| id2|20180612114049| [key1]|
+---------+--------------+-----------------+
val df2= sc.parallelize(Seq(("20180610114049", "id1","{'key1':'0.0','key2':'0.0','key3':'0.0'}"),
("20180610114049", "id2","{'key1':'0.0','key2':'0.0','key3':'0.0'}"),
("20180611114049", "id1","{'key1':'0.0','key2':'0.0','key3':'0.0'}"),
("20180612114049", "id2","{'key1':'0.0','key2':'0.0','key3':'0.0'}"),
("20180613114049", "id3","{'key1':'0.0','key2':'0.0','key3':'0.0'}")
)).toDF("date","accountid", "result")
+--------------+---------+----------------------------------------+
|date |accountid|result |
+--------------+---------+----------------------------------------+
|20180610114049|id1 |{'key1':'0.0','key2':'0.0','key3':'0.0'}|
|20180610114049|id2 |{'key1':'0.0','key2':'0.0','key3':'0.0'}|
|20180611114049|id1 |{'key1':'0.0','key2':'0.0','key3':'0.0'}|
|20180612114049|id2 |{'key1':'0.0','key2':'0.0','key3':'0.0'}|
|20180613114049|id3 |{'key1':'0.0','key2':'0.0','key3':'0.0'}|
+--------------+---------+----------------------------------------+
预期产量
+--------------+---------+----------------------------------------+
|date |accountid|result |
+--------------+---------+----------------------------------------+
|20180610114049|id1 |{'key1':'1.0','key2':'0.0','key3':'0.0'}|
|20180610114049|id2 |{'key1':'0.0','key2':'1.0','key3':'0.0'}|
|20180611114049|id1 |{'key1':'0.0','key2':'0.0','key3':'0.0'}|
|20180612114049|id2 |{'key1':'1.0','key2':'0.0','key3':'0.0'}|
|20180613114049|id3 |{'key1':'0.0','key2':'1.0','key3':'1.0'}|
+--------------+---------+----------------------------------------+