Commit cf1936af authored by wuyunfeng

Add two new traffic analysis tables

Parent 9c500371
@@ -11,6 +11,7 @@ object MyConfigSession {
  // Hive DW-layer traffic tables
  final val HIVE_TABLE1: String = "pica_dw.dw_fact_log_session"
  final val HIVE_TABLE2: String = "pica_dw.dw_fact_log_session_path"
  final val HIVE_TABLE3: String = "pica_dw.dw_fact_log_session_heart"
  // Output file path for writes
  final val PARQUET_PATH: String = "hdfs://bi-name1:8020/tmp/output/"
@@ -24,10 +25,12 @@ object MyConfigSession {
  // Three dimension (dictionary) tables used by the traffic tables as data-filter conditions
  final val ACTION_TYPE_SQL: String = "select action_type,'1' as is_valid from pica_dw.dw_dim_log_action_type where is_valid=1"
  final val ACTION_TYPE_SQL_HEART: String = "select action_type,'1' as is_valid from pica_dw.dw_dim_log_action_type where action_type='ACTION_HEART_BEAT'"
  final val CLASS_NAME_SQL: String = "select class_name, '0' as is_valid from pica_dw.dw_dim_log_class_name where is_valid=0"
  final val MENU_CODE_SQL: String = "select view_path, menu_code from pica_dw.dw_dim_log_menu_class_code where view_path is not Null"
  // Look up the action_category that corresponds to each action_type used in the traffic tables
  final val ACTION_CATEGORY_SQL: String = "select action_type,action_category from pica_dw.dw_dim_log_action_type where is_valid=1"
  final val ACTION_CATEGORY_SQL_HEART: String = "select action_type,action_category from pica_dw.dw_dim_log_action_type where action_type='ACTION_HEART_BEAT'"
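A minimal sketch of how the two new heartbeat dictionary constants might be consumed downstream. The helper name withHeartbeatCategory, the rawLogDf parameter, and its action_type column are illustrative assumptions, not code from this commit; spark.sql and DataFrame.join are standard Spark SQL APIs.

import org.apache.spark.sql.{DataFrame, SparkSession}

// Hypothetical helper (not part of this commit): restrict a raw log DataFrame to
// heartbeat events and attach their action_category, using the dictionary SQL above.
def withHeartbeatCategory(spark: SparkSession, rawLogDf: DataFrame): DataFrame = {
  val heartTypes = spark.sql(MyConfigSession.ACTION_TYPE_SQL_HEART)           // (action_type, is_valid)
  val heartCategories = spark.sql(MyConfigSession.ACTION_CATEGORY_SQL_HEART)  // (action_type, action_category)
  rawLogDf
    .join(heartTypes.select("action_type"), Seq("action_type"), "inner")
    .join(heartCategories, Seq("action_type"), "left")
}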
package com.session
import com.utils.UseUtil
import org.apache.spark.SparkConf
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}
import scala.collection.mutable.{ArrayBuffer, ListBuffer}
/**
 * Results are written to pica_dw.dw_fact_log_session_menu_calc
 * @Author yunfeng.wu
 * @Date 2020/06/12 10:23
 * @Version 1.0
 */
class SessionMenuCalc extends Serializable {

  def getSparkSession(appName: String): SparkSession = {
    val conf: SparkConf = new SparkConf().setAppName(appName)
    UseUtil.setConfigure(conf)
    val sparkSession: SparkSession = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
    sparkSession
  }

  val sparkSession: SparkSession = getSparkSession("SessionMenuCalc")
  /** Part 1: for every user/session, sum refer_time_diff per consecutive run of the same menu_code
   *  ("menu_code_term"), covering all menus except menu_code '200'. */
  def handleByMcPart1(spark: SparkSession, createdDay: String) = {
    val df = spark.sql("select cast(user_id as string),session_id,created_time,date_time, menu_code,refer_menu_code,action_code,refer_time_diff " +
      s"from pica_dw.dw_fact_log_session_path where created_day='${createdDay}' and menu_code!='200' ") //and user_id='1000000186'
    val groupRdd = df.rdd.groupBy(row => row.getAs[String]("user_id") + "_" + row.getAs[String]("session_id"))
    val resRdd = groupRdd.flatMap(g => {
      val user_session_id: String = g._1
      val user_id = user_session_id.split("_")(0)
      val session_id = user_session_id.split("_")(1)
      var rowList: Iterable[Row] = g._2
      // Accumulators: remember the previous menu and a running series number so that
      // consecutive hits on the same menu share one "<menu_code>_<series>" key.
      var before_menu = ""
      var this_menu = ""
      var series = 1
      val list = new ListBuffer[Row]() // (user_id, session_id, menu_code_term, during_by_refer, menu_code, action_code, begin_time, end_time)
      var mc_during_map = Map[String, Integer]()
      var mc_time_map = Map[String, ArrayBuffer[String]]()
      rowList = rowList.toList.sortBy(_.getAs[String]("created_time"))
      rowList.foreach(row => {
        this_menu = row.getAs[String]("menu_code")
        val refer_time_diff = row.getAs[Integer]("refer_time_diff")
        val created_time = row.getAs[String]("created_time")
        var key = this_menu + "_" + series
        if ("".equals(before_menu) || this_menu.equals(before_menu)) {
          // First hit, or same menu as the previous hit: accumulate under the current key.
          val sum_during: Integer = mc_during_map.getOrElse(key, 0)
          mc_during_map += (key -> (refer_time_diff + sum_during))
          before_menu = this_menu
        } else {
          // Menu changed: start a new series and accumulate under the new key.
          series += 1
          key = this_menu + "_" + series
          val sum_during: Integer = mc_during_map.getOrElse(key, 0)
          mc_during_map += (key -> (refer_time_diff + sum_during))
          before_menu = this_menu
        }
        // Track every created_time seen for this key so begin/end time can be derived below.
        val time_arr: ArrayBuffer[String] = mc_time_map.getOrElse(key, new ArrayBuffer[String]())
        time_arr += created_time
        mc_time_map += (key -> time_arr)
      })
      mc_during_map.foreach(kv => {
        val ar: ArrayBuffer[String] = mc_time_map.getOrElse(kv._1, new ArrayBuffer[String]())
        // println((user_id, session_id, kv._1, kv._2, kv._1.split("_")(0), ar.toArray.min, ar.toArray.max))
        list.append(Row(user_id, session_id, kv._1, kv._2, kv._1.split("_")(0), "", ar.toArray.min, ar.toArray.max))
      })
      list.toList
    })
    resRdd
  }
  /** Part 2: same aggregation for menu_code '200' only, keyed by action_code instead of menu_code. */
  def handleByMcPart2(spark: SparkSession, createdDay: String) = {
    val df = spark.sql("select cast(user_id as string),session_id,created_time,date_time, menu_code,refer_menu_code,action_code,refer_time_diff " +
      s"from pica_dw.dw_fact_log_session_path where created_day='${createdDay}' and menu_code ='200' ")
    val groupRdd = df.rdd.groupBy(row => row.getAs[String]("user_id") + "_" + row.getAs[String]("session_id"))
    val resRdd = groupRdd.flatMap(g => {
      val user_session_id: String = g._1
      val user_id = user_session_id.split("_")(0)
      val session_id = user_session_id.split("_")(1)
      var rowList: Iterable[Row] = g._2
      // Accumulators: remember the previous action and a running series number so that
      // consecutive hits on the same action share one "<action_code>_<series>" key.
      var before_action = ""
      var this_action = ""
      var series = 1
      val list = new ListBuffer[Row]() // (user_id, session_id, menu_code_term, during_by_refer, menu_code, action_code, begin_time, end_time)
      var ac_during_map = Map[String, Integer]()
      var ac_time_map = Map[String, ArrayBuffer[String]]()
      rowList = rowList.toList.sortBy(_.getAs[String]("created_time"))
      rowList.foreach(row => {
        this_action = row.getAs[String]("action_code")
        val refer_time_diff = row.getAs[Integer]("refer_time_diff")
        val created_time = row.getAs[String]("created_time")
        var key = this_action + "_" + series
        if ("".equals(before_action) || this_action.equals(before_action)) {
          // First hit, or same action as the previous hit: accumulate under the current key.
          val sum_during: Integer = ac_during_map.getOrElse(key, 0)
          ac_during_map += (key -> (refer_time_diff + sum_during))
          before_action = this_action
        } else {
          // Action changed: start a new series and accumulate under the new key.
          series += 1
          key = this_action + "_" + series
          val sum_during: Integer = ac_during_map.getOrElse(key, 0)
          ac_during_map += (key -> (refer_time_diff + sum_during))
          before_action = this_action
        }
        // Track every created_time seen for this key so begin/end time can be derived below.
        val time_arr: ArrayBuffer[String] = ac_time_map.getOrElse(key, new ArrayBuffer[String]())
        time_arr += created_time
        ac_time_map += (key -> time_arr)
      })
      ac_during_map.foreach(kv => {
        val ar: ArrayBuffer[String] = ac_time_map.getOrElse(kv._1, new ArrayBuffer[String]())
        list.append(Row(user_id, session_id, "200_0", kv._2, "200", kv._1.split("_")(0), ar.toArray.min, ar.toArray.max))
      })
      list.toList
    })
    resRdd
  }
}
object SessionMenuCalc {

  def apply(): SessionMenuCalc = new SessionMenuCalc()

  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      System.err.println("Usage: SessionMenuCalc <dbTable> <createdDay>")
      System.exit(1)
    }
    val dbTable = args(0)
    val createdDay = args(1)
    println(s"dbTable:${dbTable},createdDay:${createdDay}")
    val sessionMenuCalc: SessionMenuCalc = SessionMenuCalc()
    val resRdd1 = sessionMenuCalc.handleByMcPart1(sessionMenuCalc.sparkSession, createdDay)
    val resRdd2 = sessionMenuCalc.handleByMcPart2(sessionMenuCalc.sparkSession, createdDay)
    val resRdd = resRdd1.union(resRdd2)
    resRdd.take(20) // sample a few rows as a quick sanity check (result is discarded)
    val resDf = sessionMenuCalc.sparkSession.createDataFrame(resRdd, StructType(
      List(StructField("user_id", StringType, false),
        StructField("session_id", StringType, false),
        StructField("menu_code_term", StringType, false),
        StructField("during_by_refer", IntegerType, false),
        StructField("menu_code", StringType, false),
        StructField("action_code", StringType, false),
        StructField("begin_time", StringType, false),
        StructField("end_time", StringType, false))
    ))
    resDf.printSchema()
    resDf.createOrReplaceTempView("session_menu_view_calc")
    sessionMenuCalc.sparkSession.sql(s"insert overwrite table ${dbTable} partition(created_day='${createdDay}') " +
      s"select cast(user_id as int) user_id,session_id,menu_code_term,during_by_refer,menu_code,action_code,begin_time,end_time from session_menu_view_calc")
    sessionMenuCalc.sparkSession.close()
  }
}
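The heart of both handleByMcPart1 and handleByMcPart2 is the consecutive-run ("series") merging: rows are sorted by created_time, and a new "<code>_<series>" key starts whenever the menu (or action) changes. Below is a minimal sketch of that logic on in-memory data, runnable in a Scala REPL; the Hit case class and the sample values are illustrative only, not part of this commit.

// Illustrative stand-in for one dw_fact_log_session_path row.
case class Hit(menuCode: String, createdTime: String, referTimeDiff: Int)

val hits = List(
  Hit("101", "2020-06-12 10:00:00", 5),
  Hit("101", "2020-06-12 10:00:05", 7),
  Hit("102", "2020-06-12 10:00:12", 3),
  Hit("101", "2020-06-12 10:00:15", 4)  // back to menu 101 -> new series, key "101_3"
)

// Walk the time-sorted hits, bumping the series whenever the menu changes,
// mirroring the foreach loop above with before_menu/series.
val keyed = hits.sortBy(_.createdTime)
  .foldLeft((List.empty[(String, Hit)], "", 1)) { case ((acc, prevMenu, series), hit) =>
    val nextSeries = if (prevMenu.isEmpty || prevMenu == hit.menuCode) series else series + 1
    ((hit.menuCode + "_" + nextSeries, hit) :: acc, hit.menuCode, nextSeries)
  }._1.reverse

// Dwell time per key: Map("101_1" -> 12, "102_2" -> 3, "101_3" -> 4)
val dwellByKey = keyed.groupBy(_._1).map { case (k, hs) => k -> hs.map(_._2.referTimeDiff).sum }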
This diff is collapsed.
package com.utils
import com.session.{SessionProcess, SessionProcessArgs, SessionProcessHistoryPathArgs, SessionProcessPath, SessionProcessPathArgs}
import com.session.{SessionMenuCalc, SessionProcess, SessionProcessArgs, SessionProcessHeart, SessionProcessHistoryPathArgs, SessionProcessPath, SessionProcessPathArgs}
import org.apache.hadoop.util.ProgramDriver
/**
@@ -16,6 +16,8 @@ object Driver {
driver.addClass("SessionProcessArgs",classOf[SessionProcessArgs],"传递日期参数--用户Session数据分析导入到dw_fact_log_session表")
driver.addClass("SessionProcessPath",classOf[SessionProcessPath],"用户Session数据分析导入到dw_fact_log_session_path表")
driver.addClass("SessionProcessPathArgs",classOf[SessionProcessPathArgs],"传递日期参数--用户Session数据分析导入到dw_fact_log_session_path表")
driver.addClass("SessionProcessHeart",classOf[SessionProcessHeart],"用户Session数据分析导入到dw_fact_log_session_heart表")
driver.addClass("SessionMenuCalc",classOf[SessionMenuCalc],"传递日期参数--用户Session数据分析导入到dw_fact_log_session_menu_calc表")
driver.run(args)
}
}
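For reference, org.apache.hadoop.util.ProgramDriver dispatches on the first command-line argument and forwards the remaining arguments to the selected class's main method. A hedged sketch of launching the new job through this registry, assuming the enclosing method shown here is Driver.main; the table name and date below are placeholders.

// "SessionMenuCalc" selects the entry registered above; ProgramDriver then calls
// SessionMenuCalc.main(Array("pica_dw.dw_fact_log_session_menu_calc", "2020-06-12")).
Driver.main(Array("SessionMenuCalc", "pica_dw.dw_fact_log_session_menu_calc", "2020-06-12"))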