package com.utils

import java.sql.{Connection, DriverManager}
import java.util.Properties

import com.config.{MySQLConfig, SyncDataConfig}
import org.apache.spark.SparkConf
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

/**
  * @Author zhenxin.ma
  * @Date 2019/11/15 9:57
  * @Version 1.0
  */
object MyUtil {
	/**
	  * Reads a MySQL table over JDBC, registers it as a temporary view and runs a query on it.
	  *
	  * The JDBC read is partitioned using the column, bounds and partition count taken from
	  * `SyncDataConfig`; the credentials come from `MySQLConfig`.
	  * NOTE(review): despite the method name, nothing here writes to Hive directly —
	  * presumably `loadSQL` performs the insert; confirm against the callers.
	  *
	  * @param sparkSession active Spark session
	  * @param url          JDBC URL of the MySQL instance
	  * @param table        value of the JDBC `dbtable` option; also used as the temp view name
	  * @param loadSQL      Spark SQL statement executed once the view has been registered
	  * @return the DataFrame produced by `loadSQL`
	  */
	def loadMysqlToHive(sparkSession: SparkSession, url: String, table: String, loadSQL: String): DataFrame = {
		// Connect to MySQL; the read is partitioned on SyncDataConfig.PARTITIONCOLUMN
		val mappingData: DataFrame = sparkSession.read.format("jdbc")
			.option("driver", classOf[com.mysql.jdbc.Driver].getName)
			.option("url", url)
			.option("dbtable", table)
			.option("user", MySQLConfig.USER)
			.option("password", MySQLConfig.PSSWORD)
			.option("partitionColumn",SyncDataConfig.PARTITIONCOLUMN)
			.option("lowerBound",SyncDataConfig.LOWERBOUND)
			.option("upperBound",SyncDataConfig.UPPERBOUND)
			.option("numPartitions",SyncDataConfig.NUMPARTITIONS)
			.load()
		println("---------------- schema information---------------------")
		mappingData.printSchema()
		// Expose the JDBC data under the table's own name so that loadSQL can refer to it
		mappingData.createOrReplaceTempView(table)
		sparkSession.sql(loadSQL)
	}

	/**
	  * Applies the Spark tuning settings shared by the jobs to the given configuration.
	  *
	  * @param conf Spark configuration, modified in place
	  */
	def setConfigure(conf: SparkConf): Unit = {
		// Serialize with Kryo
		conf.set("spark.serializer", classOf[KryoSerializer].getName)
		// Upper limit of the Kryo serialization buffer
		conf.set("spark.kryoserializer.buffer.max", "128m")
		// Enable RDD compression
		conf.set("spark.rdd.compress", "true")
		// Compress with snappy rather than the default codec (lz4, according to the original note)
		conf.set("spark.io.compression.codec", "snappy")
		// Block size used by the snappy codec
		conf.set("spark.io.compression.snappy.blockSize", "64k")
		// Tune the share of memory used for persisted data.
		// NOTE(review): legacy memory mode and the two *.memoryFraction keys below are
		// deprecated settings — confirm they are still honored by the Spark version in use.
		conf.set("spark.memory.useLegacyMode", "true")
		conf.set("spark.storage.memoryFraction", "0.5")
		// Share of executor memory a task may use for aggregation after fetching the output of
		// the previous stage's tasks during a shuffle (default 0.2 per the original note).
		// Raised because the jobs are shuffle-heavy and need more memory per task.
		conf.set("spark.shuffle.memoryFraction","0.4")
		// Number of partitions produced by Spark SQL shuffles (default 200)
		conf.set("spark.sql.shuffle.partitions", "210")
		// Threshold below which SortShuffleManager uses the by-pass (no sorting) path
		// (default 200): by-pass is used when the partition count is below this value
		conf.set("spark.shuffle.sort.bypassMergeThreshold", "300")
	}

	/**
	  * Saves a DataFrame into a MySQL table.
	  *
	  * For `SaveMode.Overwrite` the target table is truncated over a plain JDBC connection and
	  * the data is then appended, so the existing table definition is kept (letting Spark drop
	  * and recreate the table would turn the column types into TEXT). Every other mode is
	  * passed to the Spark JDBC writer unchanged.
	  *
	  * @param dataFrame DataFrame to save
	  * @param tableName name of the target MySQL table
	  * @param saveMode  save mode: Append, Overwrite, ErrorIfExists or Ignore
	  * @param proPath   classpath location of the properties file holding `mysql.username`,
	  *                  `mysql.password`, `mysql.driver` and `mysql.url`
	  * @throws IllegalArgumentException if the properties file or one of its required keys is missing
	  */
	def saveASMysqlTable(dataFrame: DataFrame, tableName: String, saveMode: SaveMode, proPath: String): Unit = {
		val properties: Properties = getProPerties(proPath)
		// The keys in the properties file differ from the keys Spark expects, so the values
		// are copied into a second Properties object under Spark's names.
		val prop = new Properties
		prop.setProperty("user", requiredProperty(properties, "mysql.username", proPath))
		prop.setProperty("password", requiredProperty(properties, "mysql.password", proPath))
		prop.setProperty("driver", requiredProperty(properties, "mysql.driver", proPath))
		prop.setProperty("url", requiredProperty(properties, "mysql.url", proPath))

		val writeMode: SaveMode =
			if (saveMode == SaveMode.Overwrite) {
				truncateTable(prop, tableName)
				// The table has just been emptied, so appending completes the overwrite.
				SaveMode.Append
			} else {
				// Honor the requested mode instead of silently appending.
				saveMode
			}
		dataFrame.write.mode(writeMode).jdbc(prop.getProperty("url"), tableName, prop)
	}

	/**
	  * Loads a properties file from the classpath.
	  *
	  * @param proPath resource path, resolved relative to this object's class
	  * @return the loaded properties
	  * @throws IllegalArgumentException if no resource exists at `proPath`
	  */
	def getProPerties(proPath: String): Properties = {
		val stream = this.getClass.getResourceAsStream(proPath)
		// getResourceAsStream signals a missing resource with null; fail with a clear message
		// instead of a NullPointerException from Properties.load.
		if (stream == null) {
			throw new IllegalArgumentException(s"Properties resource not found on the classpath: $proPath")
		}
		try {
			val properties: Properties = new Properties()
			properties.load(stream)
			properties
		} finally {
			stream.close()
		}
	}

	/** Returns the value stored under `key`, failing with a descriptive error when it is absent. */
	private def requiredProperty(source: Properties, key: String, proPath: String): String =
		Option(source.getProperty(key)).getOrElse(
			throw new IllegalArgumentException(s"Missing required property '$key' in $proPath")
		)

	/**
	  * Empties `tableName` over a plain JDBC connection, always releasing the statement and
	  * the connection.
	  *
	  * The name is used exactly as given so that the truncated table is the same table the
	  * subsequent write targets, including on servers with case-sensitive identifiers.
	  * A failure is reported and swallowed on purpose: when the table does not exist yet the
	  * following write creates it.
	  * NOTE(review): if truncation fails for another reason, the append that follows will add
	  * to the existing rows — confirm whether that case should abort instead.
	  */
	private def truncateTable(prop: Properties, tableName: String): Unit = {
		try {
			val conn = DriverManager.getConnection(
				prop.getProperty("url"),
				prop.getProperty("user"),
				prop.getProperty("password")
			)
			try {
				val stmt = conn.createStatement
				try stmt.execute(s"truncate table $tableName")
				finally stmt.close()
			} finally {
				conn.close()
			}
		} catch {
			case e: Exception =>
				println("MySQL Error:")
				e.printStackTrace()
		}
	}
}
