Commit 17d9e537 authored by wuyunfeng

Ad-hoc traffic requirement: add label_class

Parent 3f553a85
@@ -9,6 +9,8 @@ package com.config
*/
object MyConfigSession {
//Hive DW-layer traffic tables
final val HIVE_TABLE1_TMP: String = "pica_dw.dw_fact_log_session_tmp"
final val HIVE_TABLE2_TMP: String = "pica_dw.dw_fact_log_session_path_tmp"
final val HIVE_TABLE1: String = "pica_dw.dw_fact_log_session"
final val HIVE_TABLE2: String = "pica_dw.dw_fact_log_session_path"
final val HIVE_TABLE3: String = "pica_dw.dw_fact_log_session_heart"
@@ -19,19 +21,18 @@ object MyConfigSession {
//Temporary views used by the traffic jobs
final val VIEW_SESSION_ODS: String = "ods_session"
final val VIEW_SESSION_NO_MATCH: String = "ods_session_no_user_id"
final val VIEW_MOBILE_PHONE: String = "mobile_phone_match"
final val VIEW_EQUIPMENT_INFO: String = "equipment_info"
final val VIEW_DEVICE_TOKEN: String = "device_token_match"
//Three dictionary tables used to filter the traffic data
final val ACTION_TYPE_SQL: String = "select action_type,'1' as is_valid from pica_dw.dw_dim_log_action_type where is_valid=1"
final val ACTION_TYPE_SQL_HEART: String = "select action_type,'1' as is_valid from pica_dw.dw_dim_log_action_type where action_type='ACTION_HEART_BEAT'"
final val CLASS_NAME_SQL: String = "select class_name, '0' as is_valid from pica_dw.dw_dim_log_class_name where is_valid=0"
final val MENU_CODE_SQL: String = "select distinct view_path, menu_code from pica_dw.dw_dim_log_menu_class_code where view_path is not Null"
//Map each action_type to its action_category
final val ACTION_CATEGORY_SQL: String = "select action_type,action_category from pica_dw.dw_dim_log_action_type where is_valid=1"
final val ACTION_CATEGORY_SQL_HEART: String = "select action_type,action_category from pica_dw.dw_dim_log_action_type where action_type='ACTION_HEART_BEAT'"
final val ACTION_URLLABEL_SQL:String = "select url_content,label_value from pica_dw.dw_dim_log_action_urllabel "
//SQL that pulls the source data from pica_log.picalog_trace_app_part; by default it takes yesterday's data
@@ -39,9 +40,10 @@ object MyConfigSession {
"""
|select pseudo_session,doctor_id,mobile,device_token,user_token_tourist,class_name,view_path,action,
|component_tag,app_version,device_type,device_brand,device_model,network_type,created from pica_log.picalog_trace_app_part
| where pseudo_session is not null and pseudo_session !=''
| and pseudo_id !='' and extra_info !='com.picahealth.patient' and serviceName != 'trace3'
| and FROM_UNIXTIME(cast(substring(created,1,10) as bigint),'yyyy-MM-dd')=created_day and created is not null and created!=''
""".stripMargin //and `action`!='ACTION_EQUIP_INFO'
//SQL executed against pica_log.picalog_trace_app_part when a specific date argument is supplied
@@ -51,16 +53,18 @@ object MyConfigSession {
|component_tag,app_version,device_type,device_brand,device_model,network_type,created from pica_log.picalog_trace_app_part
| where pseudo_session is not null and pseudo_session !=''
| and pseudo_id !='' and extra_info !='com.picahealth.patient' and serviceName != 'trace3'
| and created is not null and created!='' and `action`!='ACTION_EQUIP_INFO'
""".stripMargin """.stripMargin
//从dw_fact_log_session表中筛选数据 //从dw_fact_log_session表中筛选数据
final val SOURCE_SQL_PATH: String = final val SOURCE_SQL_PATH: String =
s""" s"""
|select session_id,cast(user_id as int) user_id,action_type,user_token,menu_code,action_code,position,label_value, |select id log_session_id, session_id,user_id_int user_id,action_type,user_token,menu_code,action_code,position,label_value,label_class,
|app_version,device_type,created_time,date_time from ${MyConfigSession.HIVE_TABLE1} |app_version,device_type,created_time,date_time from ${MyConfigSession.HIVE_TABLE1}
| where created_day=DATE_SUB(current_date(),1) and app_version >= '3.1.7' | where app_version >= '3.1.7'
| AND ((action_type ='ACTION_CLICK') OR (action_type ='ACTION_VIEW' and menu_code != '0' and menu_code !='null' and menu_code !='')) | AND ((action_type ='ACTION_CLICK' and action_code != 'null' ) OR action_type ='ACTION_VIEW' )
| and (menu_code != '0' and menu_code !='null' and menu_code !='' and length(menu_code) <= 3 )
""".stripMargin """.stripMargin
...@@ -71,12 +75,22 @@ object MyConfigSession { ...@@ -71,12 +75,22 @@ object MyConfigSession {
s""" s"""
|SELECT t.session_id, COALESCE(cast(b.id as string),'0') AS user_id, t.mobile, t.device_token, t.user_token, |SELECT t.session_id, COALESCE(cast(b.id as string),'0') AS user_id, t.mobile, t.device_token, t.user_token,
|t.view_class,t.view_path,t.action_type,t.component_tag, t.menu_code, |t.view_class,t.view_path,t.action_type,t.component_tag, t.menu_code,
|t.action_code, t.position, t.label_value,t.app_version,t.device_type, |t.action_code, t.position, t.label_value,t.label_class,t.app_version,t.device_type,
|t.device_brand, t.device_model, t.device_system,t.net_type,t.created_time, |t.device_brand, t.device_model, t.device_system,t.net_type,t.created_time,
|t.date_time from ${MyConfigSession.VIEW_SESSION_ODS} as t |t.date_time from ${MyConfigSession.VIEW_SESSION_ODS} as t
|left join pica_ds.pica_doctor as b on t.user_id = cast(b.id as string) |left join pica_ds.pica_doctor as b on t.user_id = cast(b.id as string)
""".stripMargin """.stripMargin
//1. For rows whose user_id was not matched, first match on mobile_phone to get user_id; still-unmatched rows keep '0'
final val MOBILE_PHONE_SQL: String =
s"""
|SELECT ss.session_id, COALESCE(cast(b.id as string),'0') AS user_id, ss.mobile, ss.device_token, ss.user_token,
|ss.view_class,ss.view_path,ss.action_type,ss.component_tag, ss.menu_code,
|ss.action_code, ss.position,ss.label_value,ss.label_class,ss.app_version, ss.device_type,
|ss.device_brand, ss.device_model,ss.device_system,ss.net_type,ss.created_time,
|ss.date_time from ${MyConfigSession.VIEW_SESSION_NO_MATCH} AS ss
|left join (select distinct id,mobile_phone from pica_ds.pica_doctor where pica_doctor.delete_flag = 1 ) AS b on ss.mobile = b.mobile_phone
""".stripMargin
//2. Match against the equipment table; defaults to yesterday's data
final val EQUIPMENT_INFO_SQL: String =
"""
|SELECT a.user_id,a.device_token ,ROW_NUMBER() OVER ( PARTITION BY a.device_token ORDER BY a.creat_time DESC ) row_d
@@ -98,23 +112,13 @@ object MyConfigSession {
s"""
|SELECT t.session_id, COALESCE(cast(b.user_id as string),'0') AS user_id, t.mobile, t.device_token, t.user_token,
|t.view_class,t.view_path,t.action_type,t.component_tag, t.menu_code,
|t.action_code, t.position, t.label_value,t.label_class,t.app_version,t.device_type,
|t.device_brand, t.device_model, t.device_system,t.net_type,t.created_time,
|t.date_time from (select * from ${MyConfigSession.VIEW_MOBILE_PHONE} a where a.user_id= '0' ) as t
|left join ${MyConfigSession.VIEW_EQUIPMENT_INFO} as b on t.device_token = b.device_token
""".stripMargin
//After the device_token match, take the rows whose user_id is still unmatched and match on mobile_phone; still-unmatched rows keep '0'
final val MOBILE_PHONE_SQL: String =
s"""
|SELECT ss.session_id, COALESCE(cast(b.id as string),'0') AS user_id, ss.mobile, ss.device_token, ss.user_token,
|ss.view_class,ss.view_path,ss.action_type,ss.component_tag, ss.menu_code,
|ss.action_code, ss.position,ss.label_value,ss.app_version, ss.device_type,
|ss.device_brand, ss.device_model,ss.device_system,ss.net_type,ss.created_time,
|ss.date_time from (select * from ${MyConfigSession.VIEW_DEVICE_TOKEN} as a where a.user_id = '0') AS ss
|left join (select id,mobile_phone from pica_ds.pica_doctor where pica_doctor.delete_flag = 1 ) AS b on ss.mobile = b.mobile_phone
""".stripMargin
......
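Taken together, the constants above describe a staged user_id backfill: rows that the pica_doctor join leaves at user_id = '0' are first matched on mobile_phone, and whatever is still unmatched is then matched on device_token via the equipment table. A minimal sketch of how these constants and view names appear intended to chain; only the constant and view names come from MyConfigSession, the orchestration object, the precondition comment, and the row_d = 1 filter are assumptions:

import com.config.MyConfigSession
import org.apache.spark.sql.SparkSession

object UserIdBackfillSketch {
  def run(spark: SparkSession): Unit = {
    // Assumed precondition: the job has already registered the raw session rows as
    // VIEW_SESSION_ODS and the rows the pica_doctor join left at user_id = '0'
    // as VIEW_SESSION_NO_MATCH.

    // Step 1: backfill user_id for the unmatched rows via mobile_phone.
    spark.sql(MyConfigSession.MOBILE_PHONE_SQL)
      .createOrReplaceTempView(MyConfigSession.VIEW_MOBILE_PHONE)

    // Step 2: most recent user_id per device_token from the equipment table
    // (keeping only row_d = 1 is an assumption based on the ROW_NUMBER() column).
    spark.sql(MyConfigSession.EQUIPMENT_INFO_SQL)
      .where("row_d = 1")
      .createOrReplaceTempView(MyConfigSession.VIEW_EQUIPMENT_INFO)

    // Step 3: the device_token SQL shown above then joins the rows that are still
    // user_id = '0' in VIEW_MOBILE_PHONE against VIEW_EQUIPMENT_INFO; its result
    // would be registered as VIEW_DEVICE_TOKEN for downstream use.
  }
}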
@@ -2,13 +2,19 @@ package com.session
import java.sql
import java.sql.PreparedStatement
import com.config.MyConfigSession
import com.pica.utils.{DateUtils, StringUtils}
import com.utils.{JDBCUtil, UseUtil}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.expressions.{Window, WindowSpec}
import org.apache.spark.sql.functions.{lag, row_number}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import scala.collection.mutable.ListBuffer
import scala.util.control.Breaks.{break, breakable}
/**
@@ -39,7 +45,11 @@ object SessionProcessPath {
|values(1968,'pica_dw.dw_fact_log_session_path','3',?,'0',?)
""".stripMargin
//Batch number for the data sync, formatted like 2019-09-12
var scnData: String = DateUtils.getYesterdayDate
if(args.length>=1){
scnData = args(0)
}
println(s"scnData=${scnData}")
//Task start time, formatted like 2019-09-12 14:03:30
val startTime: String = DateUtils.getTodayTime
//Parameters for the insert SQL
@@ -50,25 +60,75 @@ object SessionProcessPath {
val flag: Int = JDBCUtil.insertRecord(connSql, insertSQL, insertArr)
try {
val sparkSession: SparkSession = SessionProcessPath().getSparkSession("SessionProcessPath")
//Broadcast variable that maps position (url_content) to label_value
val positionUrlLabelBroad = UseUtil.getBroadcast(sparkSession, MyConfigSession.ACTION_URLLABEL_SQL, "url_content", "label_value")
println(s"positionUrlLabelBroad=${positionUrlLabelBroad.value}")
//Select the source data
val sourceDF: DataFrame = sparkSession.sql(MyConfigSession.SOURCE_SQL_PATH+s" and created_day='${scnData}'")
sourceDF.show()
//Users whose registration date precedes the traffic statistics date
val doctorDF: DataFrame = sparkSession.sql(
"select id from pica_ds.pica_doctor where to_date(creat_time) <=DATE_SUB(current_date(),1)")
sourceDF.join(doctorDF, sourceDF("user_id") === doctorDF("id"), "left")
.createOrReplaceTempView("tmp_table")
//Set user_id to 0 for records whose id is null
val reSql: String = "select session_id,case when id is null then 0 else user_id END as user_id,action_type," +
"user_token,menu_code,action_code,position,label_value,app_version,device_type,created_time,date_time from tmp_table"
val selectDF: DataFrame = sparkSession.sql(reSql)
println("-----------------------------------compute refer columns-----------------------------------------") println("-----------------------------------compute refer columns-----------------------------------------")
val resultDF: DataFrame = getReferColumns(selectDF,sparkSession) val referResDF: DataFrame = getReferColumns(sourceDF,sparkSession)
println("referResDF.printSchema()")
referResDF.printSchema()
println("------------------------------------单独计算label_value----------------------------------------------")
//"menu_code = '930' and action_code IN ( '930000', '930001', '930002' ) and action_type = 'ACTION_CLICK'
val newLabelRdd: RDD[Row] = referResDF.where("menu_code = '930'").rdd.mapPartitions(rows=>{
// val rowList: ListBuffer[(String,String,Integer,String,String,String,String,String,String,String,String,String,String,String,
// Integer,Integer,String,String,String,String )] = new ListBuffer()
val rowList: ListBuffer[Row]= new ListBuffer[Row]()
val positionLabelMap: Map[String, String] = positionUrlLabelBroad.value
rows.toList.foreach(row=>{
val action_code = row.getAs[String]("action_code")
val action_type = row.getAs[String]("action_type")
val position = row.getAs[String]("position")
var label_value = ""
if(List("930000","930001","930002" ).contains(action_code) && "ACTION_CLICK".equals(action_type)){
breakable {
//Match position against the url_content mapping table
for (tuple <- positionLabelMap) {
if (StringUtils.getNotNullString(position).contains(tuple._1)) {
//On a match, override the source row's label_value
label_value = tuple._2
println("--------------------menu_code match successfully-----------------------")
//Stop iterating
break()
}
}
}
}
//If label_value is still empty after the matching above, fall back to the original value
if (label_value.equals("")) {
label_value = row.getAs[String]("label_value")
}
rowList.append( Row(
StringUtils.getNotNullString(row.getAs[String]("log_session_id")),
StringUtils.getNotNullString(row.getAs[String]("session_id")),
row.getAs[Integer]("user_id"),action_type,
StringUtils.getNotNullString(row.getAs[String]("user_token")),
StringUtils.getNotNullString(row.getAs[String]("menu_code")),
StringUtils.getNotNullString(row.getAs[String]("action_code")),
StringUtils.getNotNullString(row.getAs[String]("position")),
label_value,
StringUtils.getNotNullString(row.getAs[String]("label_class")),
StringUtils.getNotNullString(row.getAs[String]("refer_menu_code")),
StringUtils.getNotNullString(row.getAs[String]("refer_action_code")),
StringUtils.getNotNullString(row.getAs[String]("refer_position")),
StringUtils.getNotNullString(row.getAs[String]("refer_action_type")),
StringUtils.getNotNullString(row.getAs[String]("refer_created")),
row.getAs[Integer]("step_id"),
StringUtils.getNotNullString(row.getAs[String]("app_version")),
StringUtils.getNotNullString(row.getAs[String]("device_type")),
StringUtils.getNotNullString(row.getAs[String]("created_time")),
StringUtils.getNotNullString(row.getAs[String]("date_time")),
row.getAs[Double]("refer_time_diff")
// StringUtils.getNotNullString(row.getAs[String]("module_class1")),
// StringUtils.getNotNullString(row.getAs[String]("module_class2"))
))
})
rowList.iterator
})
val resultDF = sparkSession.createDataFrame(newLabelRdd,referResDF.schema).union(referResDF.where("menu_code != '930'"))
println("-----------------------------------load data to pica_dw.dw_fact_log_session_path-----------------") println("-----------------------------------load data to pica_dw.dw_fact_log_session_path-----------------")
loadData(resultDF,sparkSession,scnData) loadData(resultDF,sparkSession,scnData)
...@@ -128,7 +188,7 @@ object SessionProcessPath { ...@@ -128,7 +188,7 @@ object SessionProcessPath {
//去掉refer字段中的NULL值 //去掉refer字段中的NULL值
val coaleseDF: DataFrame = rowNumberDF.selectExpr( val coaleseDF: DataFrame = rowNumberDF.selectExpr(
"session_id", "user_id", "action_type", "user_token", "menu_code", "action_code", "position", "label_value", "log_session_id","session_id", "user_id", "action_type", "user_token", "menu_code", "action_code", "position", "label_value","label_class",
"COALESCE(refer_menu_code,'') as refer_menu_code", "COALESCE(refer_menu_code,'') as refer_menu_code",
"COALESCE(refer_action_code,'') as refer_action_code", "COALESCE(refer_action_code,'') as refer_action_code",
"COALESCE(refer_position,'') as refer_position", "COALESCE(refer_position,'') as refer_position",
@@ -156,10 +216,11 @@ object SessionProcessPath {
val loadDataSql =
s"""
|insert overwrite table ${MyConfigSession.HIVE_TABLE2} partition(created_day='${partitionDay}')
| select log_session_id, session_id,user_id,action_type,user_token,menu_code,action_code,position,label_value,label_class,
| refer_menu_code,refer_action_code,refer_position,refer_action_type,
| cast(refer_time_diff as int) as refer_time_diff,
| step_id,app_version,device_type,created_time,date_time,'' module_class1,'' module_class2
| from result_view distribute by rand()
""".stripMargin
sparkSession.sql(loadDataSql)
}
......
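The new label_value logic in SessionProcessPath broadcasts the url_content to label_value dictionary and rewrites the matching menu_code '930' rows inside mapPartitions before unioning them back with the untouched rows. A condensed, self-contained sketch of that broadcast-and-rewrite pattern, using a toy schema and made-up data rather than the production columns:

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

object LabelOverrideSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("LabelOverrideSketch").master("local[*]").getOrCreate()

    // Toy stand-in for the session rows.
    val schema = StructType(Seq(
      StructField("menu_code", StringType),
      StructField("position", StringType),
      StructField("label_value", StringType)))
    val data = Seq(
      Row("930", "https://h5/article/123", "old"),
      Row("930", "unknown", "old"),
      Row("100", "whatever", "keep"))
    val df = spark.createDataFrame(spark.sparkContext.parallelize(data), schema)

    // Small dictionary broadcast to every executor (url_content -> label_value).
    val dict = spark.sparkContext.broadcast(Map("h5/article" -> "article_page"))

    // Rewrite label_value for the targeted menu_code; other rows stay untouched.
    val rewritten = df.where("menu_code = '930'").rdd.mapPartitions { rows =>
      rows.map { row =>
        val position = Option(row.getAs[String]("position")).getOrElse("")
        // First dictionary entry whose url_content fragment appears in position wins.
        val matched = dict.value.collectFirst {
          case (urlPart, label) if position.contains(urlPart) => label
        }
        Row(row.getAs[String]("menu_code"), position,
          matched.getOrElse(row.getAs[String]("label_value")))
      }
    }
    // Rebuild a DataFrame with the original schema and union the unchanged rows back in.
    val result = spark.createDataFrame(rewritten, schema)
      .union(df.where("menu_code != '930'"))
    result.show(false)
    spark.stop()
  }
}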
@@ -53,28 +53,28 @@ object SessionProcessPathArgs {
//Select the source data
val sourceSql =
s"""
|select session_id,user_id_int user_id,action_type,user_token,menu_code,action_code,position,label_value,label_class,
|app_version,device_type,created_time,date_time from ${MyConfigSession.HIVE_TABLE1_TMP}
| where created_day='${args(0)}' and app_version >= '3.1.7' and menu_code !='null' and menu_code !=''
| and ((action_type ='ACTION_VIEW' and menu_code != '0') or (action_type ='ACTION_CLICK' and action_code !=''))
""".stripMargin
val sourceDF: DataFrame = sparkSession.sql(sourceSql)
//Users whose registration date precedes the traffic statistics date
// val doctorDF: DataFrame = sparkSession.sql(
// "select id from pica_ds.pica_doctor where to_date(creat_time) <=DATE_SUB(current_date(),1)")
//
// sourceDF.join(doctorDF, sourceDF("user_id") === doctorDF("id"), "left")
// .createOrReplaceTempView("tmp_table")
//Set user_id to 0 for records whose id is null
// val reSql: String = "select session_id,case when id is null then 0 else user_id END as user_id,action_type," +
// "user_token,menu_code,action_code,position,label_value,app_version,device_type,created_time,date_time from tmp_table"
// val selectDF: DataFrame = sparkSession.sql(reSql)
println("-----------------------------------compute refer columns-----------------------------------------")
val resultDF: DataFrame = getReferColumns(sourceDF,sparkSession)
println("-----------------------------------load data to pica_dw.dw_fact_log_session_path-----------------")
loadData(resultDF,sparkSession,scnData)
@@ -136,7 +136,7 @@ object SessionProcessPathArgs {
//Replace NULLs in the refer_ columns
val coaleseDF: DataFrame = rowNumberDF.selectExpr(
"log_session_id","session_id", "user_id", "action_type", "user_token", "menu_code", "action_code", "position", "label_value","label_class",
"COALESCE(refer_menu_code,'') as refer_menu_code",
"COALESCE(refer_action_code,'') as refer_action_code",
"COALESCE(refer_position,'') as refer_position",
@@ -163,10 +163,11 @@ object SessionProcessPathArgs {
dataFrame.createOrReplaceTempView("result_view")
val loadDataSql =
s"""
|insert overwrite table ${MyConfigSession.HIVE_TABLE2_TMP} partition(created_day='${partitionDay}')
| select log_session_id,session_id,user_id,action_type,user_token,menu_code,action_code,position,label_value,label_class,
| refer_menu_code,refer_action_code,refer_position,refer_action_type,
| cast(refer_time_diff as int) as refer_time_diff,
| step_id,app_version,device_type,created_time,date_time,'' module_class1,'' module_class2
| from result_view
""".stripMargin
sparkSession.sql(loadDataSql)
......
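The args-driven variant above reads from HIVE_TABLE1_TMP and overwrites the matching created_day partition of HIVE_TABLE2_TMP for the date passed as the first argument. A hypothetical backfill launcher, assuming it sits alongside the jobs in com.session so SessionProcessPathArgs resolves without an extra import; the direct main() call is only an illustration, in practice the job would be launched via spark-submit:

import com.config.MyConfigSession
import com.pica.utils.DateUtils

object BackfillLauncherSketch {
  def main(args: Array[String]): Unit = {
    // Default to yesterday, allow an explicit created_day argument, as the jobs above do.
    val scnData = if (args.nonEmpty) args(0) else DateUtils.getYesterdayDate
    println(s"backfilling ${MyConfigSession.HIVE_TABLE2_TMP} for created_day=$scnData")
    // Delegate to the args-driven job; it reads HIVE_TABLE1_TMP and overwrites
    // the created_day partition of HIVE_TABLE2_TMP for this date.
    SessionProcessPathArgs.main(Array(scnData))
  }
}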
package com.session
import java.sql
import java.sql.PreparedStatement
import com.config.MyConfigSession
import com.pica.utils.DateUtils
import com.utils.{JDBCUtil, UseUtil}
import org.apache.spark.SparkConf
import org.apache.spark.sql.expressions.{Window, WindowSpec}
import org.apache.spark.sql.functions.{lag, row_number}
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
* Process yesterday's data and load it into the pica_dw.dw_fact_log_session_path table
* @Author zhenxin.ma
* @Date 2020/3/27 10:58
* @Version 1.0
*/
class SessionProcessTerm {
def getSparkSession(appName: String): SparkSession = {
val conf: SparkConf = new SparkConf().setAppName(appName)
UseUtil.setConfigure(conf)
val sparkSession: SparkSession = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
sparkSession
}
}
object SessionProcessTerm {
def apply(): SessionProcessTerm = new SessionProcessTerm()
def main(args: Array[String]): Unit = {
//1. Write a record to the record table before running the task
val insertSQL: String =
s"""
|insert into ${MyConfigSession.DATA_BASE}.${MyConfigSession.JDBC_TABLE} (job_id,job_name,job_type,job_scn,status,start_time)
|values(1968,'pica_dw.dw_fact_log_session_path','3',?,'0',?)
""".stripMargin
//Batch number for the data sync, formatted like 2019-09-12
val scnData: String = DateUtils.getYesterdayDate
//Task start time, formatted like 2019-09-12 14:03:30
val startTime: String = DateUtils.getTodayTime
//Parameters for the insert SQL
val insertArr: Array[String] = Array[String](scnData, startTime)
//Get a MySQL connection
val connSql: sql.Connection = JDBCUtil.getConnection()
//Insert a row into the record table
val flag: Int = JDBCUtil.insertRecord(connSql, insertSQL, insertArr)
try {
val sparkSession: SparkSession = SessionProcessTerm().getSparkSession("SessionProcessTerm")
//Select the source data
val sourceDF: DataFrame = sparkSession.sql(MyConfigSession.SOURCE_SQL_PATH)
//Users whose registration date precedes the traffic statistics date
val doctorDF: DataFrame = sparkSession.sql(
"select id from pica_ds.pica_doctor where to_date(creat_time) <=DATE_SUB(current_date(),1)")
sourceDF.join(doctorDF, sourceDF("user_id") === doctorDF("id"), "left")
.createOrReplaceTempView("tmp_table")
//Set user_id to 0 for records whose id is null
val reSql: String = "select session_id,case when id is null then 0 else user_id END as user_id,action_type," +
"user_token,menu_code,action_code,position,label_value,app_version,device_type,created_time,date_time from tmp_table"
val selectDF: DataFrame = sparkSession.sql(reSql)
println("-----------------------------------compute refer columns-----------------------------------------")
val resultDF: DataFrame = getReferColumns(selectDF,sparkSession)
println("-----------------------------------load data to pica_dw.dw_fact_log_session_path-----------------")
loadData(resultDF,sparkSession,scnData)
println("----------------------------------update task record table---------------------------------------")
//Task succeeded; update the MySQL record table
val updateSQL: String =
s"""
|update ${MyConfigSession.JDBC_TABLE} set status=?,end_time=?,data_count=? where job_id=1968 and start_time='${startTime}'
""".stripMargin
val upreSta: PreparedStatement = connSql.prepareStatement(updateSQL)
upreSta.setString(1, "1")
upreSta.setString(2, DateUtils.getTodayTime)
upreSta.setInt(3, resultDF.count().toInt)
//Update the record
upreSta.executeUpdate()
//Close the connections
JDBCUtil.close(connSql, upreSta)
sparkSession.stop()
}catch {
case e:Exception => {
println("-----------------------------------任务异常---------------------------------------------------")
val exceptionSQL: String =
s"""
|update ${MyConfigSession.JDBC_TABLE} set status=?,exception=?,end_time=? where job_id=1968 and start_time='${startTime}'
""".stripMargin
val errorArr = Array[String]("2", e.getMessage, DateUtils.getTodayTime)
JDBCUtil.insertRecord(connSql, exceptionSQL, errorArr)
connSql.close()
}
}
}
/**
* @Description Compute the refer_ columns derived from the previous event in each session
* @param dataFrame source data
* @param sparkSession the SparkSession
* @return org.apache.spark.sql.Dataset<org.apache.spark.sql.Row>
**/
def getReferColumns(dataFrame: DataFrame ,sparkSession: SparkSession):DataFrame = {
//Window computation: partition by session_id, then order by created_time
val sessionIDWinSpec: WindowSpec = Window.partitionBy("session_id").orderBy("created_time")
//Add the refer_ columns
val menuDF: DataFrame =
dataFrame.withColumn("refer_menu_code", lag(dataFrame("menu_code"), 1).over(sessionIDWinSpec))
val acodeDF: DataFrame =
menuDF.withColumn("refer_action_code", lag(menuDF("action_code"), 1).over(sessionIDWinSpec))
val positionDF: DataFrame =
acodeDF.withColumn("refer_position", lag(acodeDF("position"), 1).over(sessionIDWinSpec))
val actypeDF: DataFrame =
positionDF.withColumn("refer_action_type", lag(positionDF("action_type"), 1).over(sessionIDWinSpec))
val recreatDF: DataFrame =
actypeDF.withColumn("refer_created", lag(actypeDF("created_time"), 1).over(sessionIDWinSpec))
val rowNumberDF: DataFrame =
recreatDF.withColumn("step_id", row_number().over(sessionIDWinSpec))
//Replace NULLs in the refer_ columns
val coaleseDF: DataFrame = rowNumberDF.selectExpr(
"session_id", "user_id", "action_type", "user_token", "menu_code", "action_code", "position", "label_value",
"COALESCE(refer_menu_code,'') as refer_menu_code",
"COALESCE(refer_action_code,'') as refer_action_code",
"COALESCE(refer_position,'') as refer_position",
"COALESCE(refer_action_type,'') as refer_action_type",
"COALESCE(refer_created,created_time) as refer_created",
"step_id", "app_version", "device_type", "created_time", "date_time")
//Add refer_time_diff, the difference between created_time and refer_created
val referTimeDiff: DataFrame =
coaleseDF.withColumn("refer_time_diff", coaleseDF("created_time") - coaleseDF("refer_created"))
referTimeDiff
}
/**
* @Description Load the data into the target table
* @param dataFrame source data
* @param sparkSession the SparkSession
* @param partitionDay partition date
* @return void
**/
def loadData(dataFrame: DataFrame, sparkSession: SparkSession, partitionDay:String):Unit = {
dataFrame.createOrReplaceTempView("result_view")
val loadDataSql =
s"""
|insert overwrite table ${MyConfigSession.HIVE_TABLE2} partition(created_day='${partitionDay}')
| select session_id,user_id,action_type,user_token,menu_code,action_code,position,label_value,
| refer_menu_code,refer_action_code,refer_position,refer_action_type,
| cast(refer_time_diff as int) as refer_time_diff,step_id,app_version,device_type,created_time,date_time
| from result_view
""".stripMargin
sparkSession.sql(loadDataSql)
}
}
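getReferColumns builds every refer_* field from the previous event of the same session: lag(..., 1) over a window partitioned by session_id and ordered by created_time, plus row_number for step_id and a created_time difference for refer_time_diff. A toy, self-contained illustration of that window logic; the columns are simplified and the data is made up, with created_time shown as a plain number:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{coalesce, lag, lit, row_number}

object ReferWindowSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("ReferWindowSketch").master("local[*]").getOrCreate()
    import spark.implicits._

    // (session_id, menu_code, created_time)
    val events = Seq(
      ("s1", "100", 1000L), ("s1", "200", 2000L), ("s1", "300", 5000L),
      ("s2", "930", 1500L)).toDF("session_id", "menu_code", "created_time")

    // Same window shape as getReferColumns: one partition per session, ordered by event time.
    val w = Window.partitionBy("session_id").orderBy("created_time")
    val withRefer = events
      .withColumn("refer_menu_code", coalesce(lag($"menu_code", 1).over(w), lit("")))
      .withColumn("refer_created", coalesce(lag($"created_time", 1).over(w), $"created_time"))
      .withColumn("step_id", row_number().over(w))
      .withColumn("refer_time_diff", $"created_time" - $"refer_created")

    withRefer.orderBy("session_id", "step_id").show(false)
    // For s1 the three steps get refer_menu_code "", "100", "200" and
    // refer_time_diff 0, 1000, 3000; each session restarts at step_id = 1.
    spark.stop()
  }
}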