package com.stream

import java.sql.Timestamp
import java.text.SimpleDateFormat
import java.util.Date

import com.alibaba.fastjson.{JSON, JSONArray, JSONObject}
import com.common.DateUtil
import com.tmp.PropertiesUtil
import org.apache.hadoop.hive.ql.exec.UDF
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, Row, SparkSession, types}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.rdd.RDD
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, TimestampType}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}

import scala.collection.mutable.{ArrayBuffer, ListBuffer}

/**
  * Spark Streaming job that consumes raw trace-log messages from Kafka, archives every
  * non-empty micro-batch as text files on HDFS and inserts the parsed records into a
  * partition of a Hive table.
  *
  * @Author zhenxin.ma
  * @Date 2020/2/24 13:54
  * @Version 1.0
  */
object PraseLogRestore {
	import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges}
	import scala.util.control.NonFatal

	/** Target Hive table; assigned in `main` from the first command-line argument. */
	var tableName = ""

	/**
	  * JSON keys copied, in this order, into every output row after the leading `id`
	  * column. The same names are used as the DataFrame column names in `createDf`.
	  */
	val traceFields = List(
		"package_id",
		"uuid",
		"device_token",
		"pseudo_session",
		"pseudo_id",
		"class_name",
		"action",
		"view_path",
		"component_tag",
		"created",
		"user_token",
		"mobile",
		"doctor_id",
		"device_brand",
		"device_model",
		"app_version",
		"device_type",
		"device_ip",
		"web_data",
		"web_data_type",
		"alternate_info",
		"extra_info",
		"network_type",
		"created_on",
		"remark1",
		"remark2",
		"remark3",
		"remark4",
		"remark5",
		"remark6",
		"remark7",
		"remark8",
		"remark9",
		"remark10",
		"user_token_tourist",
		"machineID",
		"serviceName",
		"serviceSidePacketId",
		"serviceSideRecordId")

	/** Partition day used when no second command-line argument is supplied. */
	private val DefaultRestoreDay = "2020-02-24"

	/** Shape a partition day must have before it is interpolated into a path or SQL text. */
	private val DayPattern = """\d{4}-\d{2}-\d{2}"""

	/** How far to look back, in milliseconds, when a batch is processed during minute 00:00. */
	private val MidnightLookBackMs = 10000L

	/**
	  * Entry point.
	  *
	  * @param args `args(0)` is the target Hive table (required); `args(1)` is an optional
	  *             partition day formatted `yyyy-MM-dd`, defaulting to `2020-02-24`.
	  */
	def main(args: Array[String]): Unit = {
		if (args.length < 1) {
			System.err.println("Usage: ParseLog <tableName> [restoreDay yyyy-MM-dd]")
			System.exit(1)
		}
		tableName = args(0)

		// The day ends up inside an HDFS path and a SQL statement, so reject anything unexpected.
		val restoreDay = if (args.length > 1) args(1).trim else DefaultRestoreDay
		if (!restoreDay.matches(DayPattern)) {
			System.err.println(s"restoreDay must be formatted as yyyy-MM-dd, got: ${restoreDay}")
			System.exit(1)
		}

		val conf: SparkConf = new SparkConf().setAppName("ParseLogRestore")
		conf.set("spark.serializer", classOf[KryoSerializer].getName)
		// Enable the back-pressure mechanism.
		conf.set("spark.streaming.backpressure.enabled", "true")
		// Initial maximum rate for the first batch while back-pressure is enabled.
		conf.set("spark.streaming.backpressure.initialRate", "1000")
		// Maximum number of records consumed per second from each Kafka partition.
		conf.set("spark.streaming.kafka.maxRatePerPartition", "1000")

		val spark = SparkSession
			.builder()
			.config(conf)
			.enableHiveSupport()
			.getOrCreate()

		val ssc = new StreamingContext(spark.sparkContext, Seconds(PropertiesUtil.propertiesMap.get("kafka.during").toLong))
		val kafkaParams = Map[String, Object](
			"bootstrap.servers" -> s"${PropertiesUtil.propertiesMap.get("kafka.brokers")}",
			"key.deserializer" -> classOf[StringDeserializer],
			"value.deserializer" -> classOf[StringDeserializer],
			"group.id" -> s"${PropertiesUtil.propertiesMap.get("kafka.groupId")}",
			// When the group has no committed offset, start from the earliest available record.
			"auto.offset.reset" -> "earliest",
			// Offsets are committed manually after a batch has been written (see below);
			// auto-commit could mark records as consumed before they reach HDFS/Hive.
			"enable.auto.commit" -> (false: java.lang.Boolean)
		)
		val topics = Array(s"${PropertiesUtil.propertiesMap.get("kafka.topic")}")

		// Direct (receiver-less) stream.
		val stream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](ssc,
			LocationStrategies.PreferConsistent,
			ConsumerStrategies.Subscribe[String, String](topics, kafkaParams))

		stream.foreachRDD { batch =>
			// Offset ranges must be read from the RDD produced by the direct stream itself,
			// i.e. before any transformation is applied to it.
			val offsetRanges = batch.asInstanceOf[HasOffsetRanges].offsetRanges

			// Skip empty micro-batches so they do not leave empty HDFS directories or
			// trigger a no-op Hive insert.
			if (!batch.isEmpty()) {
				// Records with a null value would make both the archive and the parser throw.
				val rs = batch.map(_.value()).filter(_ != null).repartition(1)

				// A single timestamp per batch keeps the partition day and the path consistent.
				val batchTime = new Date()
				val tday = partitionDay(restoreDay, batchTime)
				val path = s"/data/logs/trace_logs_tmp/${tday}/${batchTime.getTime}"
				println(s"hdfs_path==>${path}")
				// NOTE(review): an earlier comment said this archive should be gzip-compressed,
				// but no codec was ever passed; output stays uncompressed to preserve behavior.
				rs.saveAsTextFile(path)

				val rdd = parseForeach(rs.filter(_.contains("datas\\\":")))
				val df = createDf(spark, rdd)
				df.show()
				writeToHive(spark, df, tday)
			}

			// Reached only when the outputs above did not throw, so a failed batch is
			// replayed after a restart instead of being silently skipped.
			stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
		}
		ssc.start()
		ssc.awaitTermination()
	}

	/**
	  * Chooses the `created_day` partition for a batch.
	  *
	  * Returns `baseDay`, except while the wall clock is in minute 00:00, where the calendar
	  * day of `now` minus ten seconds is returned instead.
	  * NOTE(review): the 00:00 rule replaces the configured day with a real calendar day;
	  * confirm that is wanted for a restore run.
	  *
	  * @param baseDay day to use outside the 00:00 minute
	  * @param now     processing time of the batch
	  */
	private def partitionDay(baseDay: String, now: Date): String = {
		val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
		val time = sdf.format(now).split(" ")(1).trim
		if (time.startsWith("00:00")) {
			sdf.format(new Date(now.getTime - MidnightLookBackMs)).split(" ")(0).trim
		} else {
			baseDay
		}
	}

	/**
	  * Parses raw log lines into rows laid out as `id` followed by `traceFields`.
	  *
	  * @param rs raw lines, each expected to wrap an escaped JSON document with a `datas` array
	  * @return one row per element of every `datas` array; unparsable lines contribute no rows
	  */
	def parseForeach(rs: RDD[String]): RDD[Row] = {
		rs.flatMap(line => parseLine(line))
	}

	/**
	  * Unwraps one raw line and converts every element of its `datas` array into a row.
	  * Parse failures are logged and yield whatever rows were built before the failure.
	  */
	private def parseLine(line: String): List[Row] = {
		// Drop a trailing ']' or ')' wrapper character.
		val trimmed = if (line.endsWith("]") || line.endsWith(")")) line.dropRight(1) else line
		// Remove every backslash, then the first double quote and the last character,
		// leaving the bare JSON text.
		val jsonStr = trimmed.replace("\\", "").replaceFirst("\"", "").dropRight(1)

		val rows = new ListBuffer[Row]()
		try {
			// parseObject / getJSONArray return null for empty input or a missing key.
			Option(JSON.parseObject(jsonStr))
				.flatMap(obj => Option(obj.getJSONArray("datas")))
				.foreach { datas =>
					for (i <- 0 until datas.size()) {
						Option(datas.getJSONObject(i)).foreach(record => rows.append(toRow(record)))
					}
				}
		} catch {
			case NonFatal(e) => println(s"parseException:${e.getMessage}===>jsonStr:${jsonStr}")
		}
		rows.toList
	}

	/** Builds one output row: a constant id of 0 followed by one value per entry of `traceFields`. */
	private def toRow(record: JSONObject): Row =
		Row.fromSeq(0 +: traceFields.map(field => fieldValue(record, field)))

	/**
	  * Value stored for `field`: the current time for `created_on`, otherwise the JSON value
	  * as a string. Missing keys and JSON nulls become "" because the schema is non-nullable.
	  */
	private def fieldValue(record: JSONObject, field: String): Any =
		if (field == "created_on") new Timestamp(new Date().getTime)
		else Option(record.getString(field)).getOrElse("")

	/**
	  * Wraps parsed rows in a DataFrame whose columns are `id` (integer) followed by
	  * `traceFields`; `created_on` is a timestamp, every other field a string, all non-nullable.
	  *
	  * @param spark active session
	  * @param rdd   rows as produced by `parseForeach`
	  */
	def createDf(spark: SparkSession, rdd: RDD[Row]): DataFrame = {
		val idColumn = StructField("id", IntegerType, nullable = false)
		val traceColumns = traceFields.map { name =>
			val dataType: types.DataType = if (name == "created_on") TimestampType else StringType
			StructField(name, dataType, nullable = false)
		}
		spark.createDataFrame(rdd, types.StructType(idColumn :: traceColumns))
	}

	/**
	  * Appends `df` to partition `created_day = tday` of the Hive table named by `tableName`.
	  *
	  * The DataFrame is exposed as temp view `picalog_trace_app_part` and inserted with
	  * `select *`.
	  * NOTE(review): this presumably relies on the view's column order matching the target
	  * table — verify against the table definition.
	  *
	  * @param spark active session with Hive support
	  * @param df    rows to insert
	  * @param tday  partition value, formatted `yyyy-MM-dd`
	  */
	def writeToHive(spark: SparkSession, df: DataFrame, tday: String): Unit = {
		df.createOrReplaceTempView("picalog_trace_app_part")
		val sql = s"insert into ${tableName} partition(created_day='${tday}') select * from picalog_trace_app_part"
		println(s"[excute sql]:${sql}")
		spark.sql(sql)
	}

}
