Commit c1230591 authored by zhenxin.ma

Modified the logic (extracted the mapPartitions result-building code into a getDataFrame helper method)

Parent a23da01d
@@ -57,54 +57,8 @@ object SyncAttachRegionResult {
    val broadcast: Broadcast[Array[(Long, Long, Long, Long, Long)]] =
      sparkSession.sparkContext.broadcast(ppact)
-    val reDS: Dataset[(Long, Long, Long, Long, Long, Long)] = df1.mapPartitions(it => {
-      // Holds the final result; each element is one output tuple
-      val tuples: ListBuffer[(Long, Long, Long, Long, Long, Long)] = ListBuffer[(Long, Long, Long, Long, Long, Long)]()
-      val list: List[Row] = it.toList
-      list.foreach(row => {
-        var count: Int = 0
-        // Note: Int columns are converted to Long here
-        val project_id: Long = row.getAs[Int]("project_id").toLong
-        val doctor_id: Long = row.getAs[Int]("doctor_id").toLong
-        val province_id: Long = row.getAs[Long]("province_id")
-        val city_id: Long = row.getAs[Long]("city_id")
-        val county_id: Long = row.getAs[Long]("county_id")
-        val town_id: Long = row.getAs[Long]("town_id")
-        if (province_id != 0) {
-          count = count + 1
-        }
-        if (city_id != 0) {
-          count = count + 1
-        }
-        if (county_id != 0) {
-          count = count + 1
-        }
-        if (town_id != 0) {
-          count = count + 1
-        }
-        // Look up matching rows in the broadcast variable
-        val broad: Array[(Long, Long, Long, Long, Long)] = broadcast.value
-        broad.foreach(tuple => {
-          if (count == 0 && project_id == tuple._1) {
-            tuples += ((project_id, doctor_id, tuple._2, tuple._3, tuple._4, tuple._5))
-          } else if (count == 1 && project_id == tuple._1 && province_id == tuple._2) {
-            tuples += ((project_id, doctor_id, tuple._2, tuple._3, tuple._4, tuple._5))
-          } else if (count == 2 && project_id == tuple._1 && city_id == tuple._3) {
-            tuples += ((project_id, doctor_id, tuple._2, tuple._3, tuple._4, tuple._5))
-          } else if (count == 3 && project_id == tuple._1 && county_id == tuple._4) {
-            tuples += ((project_id, doctor_id, tuple._2, tuple._3, tuple._4, tuple._5))
-          } else if (count == 4 && project_id == tuple._1 && town_id == tuple._5) {
-            tuples += ((project_id, doctor_id, tuple._2, tuple._3, tuple._4, tuple._5))
-          }
-        })
-      })
-      tuples.iterator
-    })
-    // Convert to named columns
-    val reDF: DataFrame = reDS.toDF("project_id","doctor_id","province_id","city_id","county_id","town_id")
+    // Build the DataFrame with named columns
+    val reDF: DataFrame = getDataFrame(df1,sparkSession,broadcast)
    // Write to a parquet file
    reDF.write.mode(SaveMode.Overwrite).format("parquet")
      .save(s"${SyncDataConfig.PARQUET_PATH}${SyncDataConfig.DATABASE2}.${SyncDataConfig.Hive_TABLE6}")
@@ -138,4 +92,53 @@ object SyncAttachRegionResult {
      }
    }
  }
+  def getDataFrame(data: DataFrame, sparkSession: SparkSession, broadcast: Broadcast[Array[(Long, Long, Long, Long, Long)]]): DataFrame = {
+    import sparkSession.implicits._
+    val reDS: Dataset[(Long, Long, Long, Long, Long, Long)] = data.mapPartitions(it => {
+      // Holds the final result; each element is one output tuple
+      val tuples: ListBuffer[(Long, Long, Long, Long, Long, Long)] = ListBuffer[(Long, Long, Long, Long, Long, Long)]()
+      val list: List[Row] = it.toList
+      list.foreach(row => {
+        // count = number of non-zero region levels on this row
+        var count: Int = 0
+        // Note: Int columns are converted to Long here
+        val project_id: Long = row.getAs[Int]("project_id").toLong
+        val doctor_id: Long = row.getAs[Int]("doctor_id").toLong
+        val province_id: Long = row.getAs[Long]("province_id")
+        val city_id: Long = row.getAs[Long]("city_id")
+        val county_id: Long = row.getAs[Long]("county_id")
+        val town_id: Long = row.getAs[Long]("town_id")
+        if (province_id != 0) {
+          count = count + 1
+        }
+        if (city_id != 0) {
+          count = count + 1
+        }
+        if (county_id != 0) {
+          count = count + 1
+        }
+        if (town_id != 0) {
+          count = count + 1
+        }
+        // Match against the broadcast region table at the appropriate level
+        val broad: Array[(Long, Long, Long, Long, Long)] = broadcast.value
+        broad.foreach(tuple => {
+          if (count == 0 && project_id == tuple._1) {
+            tuples += ((project_id, doctor_id, tuple._2, tuple._3, tuple._4, tuple._5))
+          } else if (count == 1 && project_id == tuple._1 && province_id == tuple._2) {
+            tuples += ((project_id, doctor_id, tuple._2, tuple._3, tuple._4, tuple._5))
+          } else if (count == 2 && project_id == tuple._1 && city_id == tuple._3) {
+            tuples += ((project_id, doctor_id, tuple._2, tuple._3, tuple._4, tuple._5))
+          } else if (count == 3 && project_id == tuple._1 && county_id == tuple._4) {
+            tuples += ((project_id, doctor_id, tuple._2, tuple._3, tuple._4, tuple._5))
+          } else if (count == 4 && project_id == tuple._1 && town_id == tuple._5) {
+            tuples += ((project_id, doctor_id, tuple._2, tuple._3, tuple._4, tuple._5))
+          }
+        })
+      })
+      tuples.iterator
+    })
+    // Convert to a DataFrame with named columns
+    val result: DataFrame = reDS.toDF("project_id","doctor_id","province_id","city_id","county_id","town_id")
+    result
+  }
}
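
For reference, the per-row rule that getDataFrame applies (join each input row to the broadcast region rows of the same project, matching on the deepest region level that is filled in, as indicated by the count of non-zero ids) can be exercised without Spark. The following is a minimal, Spark-free sketch of that matching rule, not the commit's own code: the names RegionMatchSketch and matchRegions and the sample region ids are hypothetical; only the broadcast tuple layout (project_id, province_id, city_id, county_id, town_id) and the count-based level selection are taken from the diff above.

object RegionMatchSketch {
  // regions rows follow the broadcast layout used above:
  // (project_id, province_id, city_id, county_id, town_id)
  def matchRegions(projectId: Long, doctorId: Long,
                   provinceId: Long, cityId: Long, countyId: Long, townId: Long,
                   regions: Array[(Long, Long, Long, Long, Long)]): Seq[(Long, Long, Long, Long, Long, Long)] = {
    // count = how many region levels are filled in on the input row (0 means project-wide)
    val count: Int = Seq(provinceId, cityId, countyId, townId).count(_ != 0)
    regions.toSeq.collect {
      case t if t._1 == projectId && (
        (count == 0) ||
        (count == 1 && provinceId == t._2) ||
        (count == 2 && cityId == t._3) ||
        (count == 3 && countyId == t._4) ||
        (count == 4 && townId == t._5)) =>
        (projectId, doctorId, t._2, t._3, t._4, t._5)
    }
  }

  def main(args: Array[String]): Unit = {
    // Hypothetical region rows; the ids are made up for illustration.
    val regions = Array((1L, 11L, 110L, 1100L, 11000L), (1L, 12L, 120L, 1200L, 12000L))
    // A row attached at province 11 in project 1: count is 1, so only the first region row matches.
    println(matchRegions(1L, 42L, 11L, 0L, 0L, 0L, regions))
  }
}

Note that, as in the commit, a row with count == 1 is compared only on province_id; the rule assumes the levels are filled in from province downward, so count identifies the deepest level to match on.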