Commit d458386b authored by wuyunfeng

Change SessionProcess to run in batches

Parent 1a0bc349
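The gist of the change: rather than shuffling and grouping the whole day's log in one job, the cached source DataFrame is carved into five slices by the first character of pseudo_session, and the full pipeline runs once per slice, with the first slice overwriting the day's Hive partition and later slices appending to it. A minimal sketch of that pattern, assuming a hypothetical table name and date in place of the commit's MyConfigSession constants and omitting the per-slice session processing:

import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.storage.StorageLevel

object BatchSlicingSketch {
  def run(spark: SparkSession, sourceDF: DataFrame): Long = {
    // Five key ranges over the first character of pseudo_session (hex-like ids).
    val conditionGroup = List("<='2'", "between '3' and '5'", "between '6' and '8'",
      "between '9' and 'b'", ">='c'")
    sourceDF.persist(StorageLevel.MEMORY_AND_DISK_SER) // every slice re-reads the cached source
    var dataCount = 0L
    for ((condition, i) <- conditionGroup.zipWithIndex) {
      val slideDF = sourceDF.where(s"SUBSTRING(pseudo_session,1,1) $condition")
      // ... per-slice session processing would happen here ...
      // The first slice replaces the day's partition; the others append to it.
      val insertMode = if (i == 0) "insert overwrite" else "insert into"
      slideDF.createOrReplaceTempView("fact_log_session")
      spark.sql(s"$insertMode table dw.dw_fact_log_session " + // hypothetical table name
        "partition(created_day='2020-01-01') select * from fact_log_session distribute by rand()")
      dataCount += slideDF.count()
    }
    dataCount // total rows written, later recorded in the MySQL record table
  }
}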
@@ -51,8 +51,16 @@ object SessionProcess {
     val sessionProcess: SessionProcess = SessionProcess()
     //step1: fetch the source data, repartition it (forcing a shuffle, since Spark's default partition count when reading Hive is too low), and deduplicate
     var sourceDF: DataFrame = sessionProcess.sparkSession.sql(MyConfigSession.SOURCE_SQL+s" and created_day='${scnData}'").repartition(200).distinct()
+    var conditionGroup = List( "<='2'","between '3' and '5'","between '6' and '8'","between '9' and 'b'",">='c'" )
+    var dataCount = 0
+    var index = 0
+    sourceDF.persist(StorageLevel.MEMORY_AND_DISK_SER)
+    for(condition <- conditionGroup){
+      index += 1
+      val slideDF = sourceDF.where(s" SUBSTRING(pseudo_session,1,1) ${condition}").repartition(100)
     //step2: extract the non-null device_token, doctor_id and mobile of each pseudo_session for the day, and backfill them into records of the same pseudo_session where those fields are null
-    val groupRdd = sourceDF.rdd.groupBy(r => r.getAs[String]("pseudo_session"))
+    val groupRdd = slideDF.rdd.groupBy(r => r.getAs[String]("pseudo_session"))
     val resRdd = groupRdd.flatMap(g => {
       val pseudo_session = g._1
       val resList: ListBuffer[Row] = new ListBuffer[Row]()
@@ -106,11 +114,10 @@ object SessionProcess {
     val resDF = sessionProcess.sparkSession.createDataFrame(resRdd,sourceDF.schema)
     resDF.persist(StorageLevel.MEMORY_AND_DISK_SER)
     println("resDF.show=======>")
     // resDF.show()
     val data: RDD[Row] = resDF.rdd.mapPartitions(sessionProcess.filterRows)
     println("---------------------------------------process columns-------------------------------------------")
     val baseDF: DataFrame = data.mapPartitions(sessionProcess.processColumns)
       .toDF("pseudo_session", "user_id", "mobile", "device_token", "user_token", "view_class", "view_path",
         "action_type", "component_tag", "menu_code", "action_code", "position", "label_value","label_class","module_class1","module_class2", "app_version",
@@ -120,16 +127,20 @@ object SessionProcess {
     //the default storage level is MEMORY_AND_DISK
     sessionIdDF.persist(StorageLevel.MEMORY_AND_DISK_SER)
     println("sessionIdDF.show=======>")
-    // sessionIdDF.show()
     println("-------------------------------match user_id logic-------------------------------------------------")
     val dwFactLogSession: DataFrame = sessionProcess.matchUserId(sessionIdDF,sessionProcess.sparkSession,scnData)
     println("dwFactLogSession.show=======>")
-    // dwFactLogSession.show()
     println("-----------------create view fact_log_session and load to dw_fact_log_session--------------------")
     dwFactLogSession.createOrReplaceTempView("fact_log_session")
+    var insertMode = "insert overwrite"
+    if(index!=1){
+      insertMode = "insert into"
+    }
     val loadDataSql =
-      s"insert overwrite table ${MyConfigSession.HIVE_TABLE1} partition(created_day='${scnData}') select * from fact_log_session distribute by rand()"
+      s"${insertMode} table ${MyConfigSession.HIVE_TABLE1} partition(created_day='${scnData}') select * from fact_log_session distribute by rand()"
     sessionProcess.sparkSession.sql(loadDataSql)
+    dataCount = dataCount + dwFactLogSession.count().toInt
+    }
     println("----------------------------------update task record table---------------------------------------")
     //the task succeeded, so update the MySQL record config table
@@ -141,7 +152,7 @@ object SessionProcess {
     val upreSta: PreparedStatement = connSql.prepareStatement(updateSQL)
     upreSta.setString(1, "1")
     upreSta.setString(2, endTime)
-    upreSta.setInt(3, sessionIdDF.count().toInt)
+    upreSta.setInt(3, dataCount)
     //update the table row
     upreSta.executeUpdate()
     //close the connection
......
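The body of the step2 backfill is elided in this diff, but its idea is to group each slice's rows by pseudo_session and copy the non-null device_token, doctor_id and mobile values onto rows of the same session where those fields are missing. A simplified sketch of that idea on a typed Dataset, assuming a narrow hypothetical LogEvent case class (the commit itself operates on untyped Rows carrying the full schema):

import org.apache.spark.sql.{Dataset, SparkSession}

// Hypothetical narrow view of the log schema; the real table has many more columns.
case class LogEvent(pseudo_session: String, device_token: Option[String],
                    doctor_id: Option[String], mobile: Option[String])

object BackfillSketch {
  def backfill(spark: SparkSession, events: Dataset[LogEvent]): Dataset[LogEvent] = {
    import spark.implicits._
    events.groupByKey(_.pseudo_session).flatMapGroups { (_, rows) =>
      val buf = rows.toList
      // Take the first non-null value seen for each field within the session.
      val token  = buf.flatMap(_.device_token).headOption
      val doctor = buf.flatMap(_.doctor_id).headOption
      val mob    = buf.flatMap(_.mobile).headOption
      // Fill only the rows where the field is still empty.
      buf.map(e => e.copy(
        device_token = e.device_token.orElse(token),
        doctor_id    = e.doctor_id.orElse(doctor),
        mobile       = e.mobile.orElse(mob)))
    }
  }
}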