zhenxin.ma / study-report

Commit c1230591, authored Jan 17, 2020 by zhenxin.ma
Parent: a23da01d

    Changed the logic

Showing 1 changed file with 51 additions and 48 deletions

src/main/scala/com/data/SyncAttachRegionResult.scala (+51, -48)
@@ -57,54 +57,8 @@ object SyncAttachRegionResult {
     val broadcast: Broadcast[Array[(Long, Long, Long, Long, Long)]] =
       sparkSession.sparkContext.broadcast(ppact)
-    val reDS: Dataset[(Long, Long, Long, Long, Long, Long)] = df1.mapPartitions(it => {
-      // Store the final result; each element represents one tuple
-      val tuples: ListBuffer[(Long, Long, Long, Long, Long, Long)] =
-        ListBuffer[(Long, Long, Long, Long, Long, Long)]()
-      val list: List[Row] = it.toList
-      list.foreach(row => {
-        var count: Int = 0
-        // Note: a type conversion happens here
-        val project_id: Long = row.getAs[Int]("project_id").toLong
-        val doctor_id: Long = row.getAs[Int]("doctor_id").toLong
-        val province_id: Long = row.getAs[Long]("province_id")
-        val city_id: Long = row.getAs[Long]("city_id")
-        val county_id: Long = row.getAs[Long]("county_id")
-        val town_id: Long = row.getAs[Long]("town_id")
-        if (province_id != 0) { count = count + 1 }
-        if (city_id != 0) { count = count + 1 }
-        if (county_id != 0) { count = count + 1 }
-        if (town_id != 0) { count = count + 1 }
-        // Look the row up in the broadcast variable
-        val broad: Array[(Long, Long, Long, Long, Long)] = broadcast.value
-        broad.foreach(tuple => {
-          if (count == 0 && project_id == tuple._1) {
-            tuples += ((project_id, doctor_id, tuple._2, tuple._3, tuple._4, tuple._5))
-          } else if (count == 1 && project_id == tuple._1 && province_id == tuple._2) {
-            tuples += ((project_id, doctor_id, tuple._2, tuple._3, tuple._4, tuple._5))
-          } else if (count == 2 && project_id == tuple._1 && city_id == tuple._3) {
-            tuples += ((project_id, doctor_id, tuple._2, tuple._3, tuple._4, tuple._5))
-          } else if (count == 3 && project_id == tuple._1 && county_id == tuple._4) {
-            tuples += ((project_id, doctor_id, tuple._2, tuple._3, tuple._4, tuple._5))
-          } else if (count == 4 && project_id == tuple._1 && town_id == tuple._5) {
-            tuples += ((project_id, doctor_id, tuple._2, tuple._3, tuple._4, tuple._5))
-          }
-        })
-      })
-      tuples.iterator
-    })
-    // Convert to named columns
-    val reDF: DataFrame = reDS.toDF("project_id", "doctor_id", "province_id", "city_id", "county_id", "town_id")
+    // Build the DataFrame with named columns
+    val reDF: DataFrame = getDataFrame(df1, sparkSession, broadcast)
     // Write the result to a parquet file
     reDF.write.mode(SaveMode.Overwrite).format("parquet")
       .save(s"${SyncDataConfig.PARQUET_PATH}${SyncDataConfig.DATABASE2}.${SyncDataConfig.Hive_TABLE6}")
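The branch ladder removed above keys each row to a broadcast entry by how many of its four region levels are non-zero: an empty row matches on project_id alone, a row with count == 1 must also match the province, count == 2 the city, count == 3 the county, and count == 4 the town. A minimal, Spark-free sketch of that selection rule follows; the names RegionMatchSketch, filledLevels, and matchRegion are hypothetical and not part of the original source.

object RegionMatchSketch {
  // One broadcast entry: (project_id, province_id, city_id, county_id, town_id)
  type RegionTuple = (Long, Long, Long, Long, Long)

  // Mirrors the four `if (... != 0) count = count + 1` checks in the diff.
  def filledLevels(provinceId: Long, cityId: Long, countyId: Long, townId: Long): Int =
    Seq(provinceId, cityId, countyId, townId).count(_ != 0)

  // Selects the broadcast tuples a row attaches to, following the
  // count-based branches of the foreach loop in the diff.
  def matchRegion(projectId: Long, doctorId: Long,
                  provinceId: Long, cityId: Long, countyId: Long, townId: Long,
                  broadcastValue: Array[RegionTuple]): Seq[(Long, Long, Long, Long, Long, Long)] = {
    val count = filledLevels(provinceId, cityId, countyId, townId)
    broadcastValue.toList.collect {
      case (p, prov, city, county, town)
        if p == projectId && (count match {
          case 0 => true
          case 1 => provinceId == prov
          case 2 => cityId == city
          case 3 => countyId == county
          case 4 => townId == town
          case _ => false
        }) =>
        (projectId, doctorId, prov, city, county, town)
    }
  }

  def main(args: Array[String]): Unit = {
    val broadcastValue: Array[RegionTuple] = Array((1L, 10L, 100L, 1000L, 10000L))
    // Only the province is filled in (count == 1), so the match is on province_id.
    println(matchRegion(1L, 7L, 10L, 0L, 0L, 0L, broadcastValue))
    // Prints: List((1,7,10,100,1000,10000))
  }
}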
@@ -138,4 +92,53 @@ object SyncAttachRegionResult {
       }
     }
   }
+
+  def getDataFrame(data: DataFrame, sparkSession: SparkSession,
+                   broadcast: Broadcast[Array[(Long, Long, Long, Long, Long)]]): DataFrame = {
+    import sparkSession.implicits._
+    val reDS: Dataset[(Long, Long, Long, Long, Long, Long)] = data.mapPartitions(it => {
+      // Store the final result; each element represents one tuple
+      val tuples: ListBuffer[(Long, Long, Long, Long, Long, Long)] =
+        ListBuffer[(Long, Long, Long, Long, Long, Long)]()
+      val list: List[Row] = it.toList
+      list.foreach(row => {
+        var count: Int = 0
+        // Note: a type conversion happens here
+        val project_id: Long = row.getAs[Int]("project_id").toLong
+        val doctor_id: Long = row.getAs[Int]("doctor_id").toLong
+        val province_id: Long = row.getAs[Long]("province_id")
+        val city_id: Long = row.getAs[Long]("city_id")
+        val county_id: Long = row.getAs[Long]("county_id")
+        val town_id: Long = row.getAs[Long]("town_id")
+        if (province_id != 0) { count = count + 1 }
+        if (city_id != 0) { count = count + 1 }
+        if (county_id != 0) { count = count + 1 }
+        if (town_id != 0) { count = count + 1 }
+        // Look the row up in the broadcast variable
+        val broad: Array[(Long, Long, Long, Long, Long)] = broadcast.value
+        broad.foreach(tuple => {
+          if (count == 0 && project_id == tuple._1) {
+            tuples += ((project_id, doctor_id, tuple._2, tuple._3, tuple._4, tuple._5))
+          } else if (count == 1 && project_id == tuple._1 && province_id == tuple._2) {
+            tuples += ((project_id, doctor_id, tuple._2, tuple._3, tuple._4, tuple._5))
+          } else if (count == 2 && project_id == tuple._1 && city_id == tuple._3) {
+            tuples += ((project_id, doctor_id, tuple._2, tuple._3, tuple._4, tuple._5))
+          } else if (count == 3 && project_id == tuple._1 && county_id == tuple._4) {
+            tuples += ((project_id, doctor_id, tuple._2, tuple._3, tuple._4, tuple._5))
+          } else if (count == 4 && project_id == tuple._1 && town_id == tuple._5) {
+            tuples += ((project_id, doctor_id, tuple._2, tuple._3, tuple._4, tuple._5))
+          }
+        })
+      })
+      tuples.iterator
+    })
+    val result: DataFrame = reDS.toDF("project_id", "doctor_id", "province_id", "city_id", "county_id", "town_id")
+    result
+  }
 }
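With the matching logic extracted into getDataFrame, the helper can be exercised on its own against a local SparkSession. A rough smoke-test sketch follows, assuming the package com.data implied by the file path and a project build that puts Spark SQL on the classpath; GetDataFrameSmokeTest and the sample values are hypothetical.

import org.apache.spark.sql.{DataFrame, SparkSession}
import com.data.SyncAttachRegionResult

object GetDataFrameSmokeTest {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("getDataFrame-smoke").getOrCreate()
    import spark.implicits._

    // Column types mirror the getAs calls in the diff: Int ids, Long region ids.
    val df: DataFrame = Seq((1, 7, 10L, 0L, 0L, 0L))
      .toDF("project_id", "doctor_id", "province_id", "city_id", "county_id", "town_id")

    // One broadcast entry per (project, province, city, county, town) mapping.
    val broadcast = spark.sparkContext.broadcast(Array((1L, 10L, 100L, 1000L, 10000L)))

    // The row only has province_id set, so it should attach to the full region chain.
    val result = SyncAttachRegionResult.getDataFrame(df, spark, broadcast)
    result.show()

    spark.stop()
  }
}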