数据回流

脚本开发

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
-- 查看表结构
show create table stg.stg_base_info
desc stg.stg_base_info

-- 新增加表字段
ALTER TABLE stg.stg_base_info ADD COLUMNS (
temp_lease_area STRING COMMENT '临时租约面积',
temp_lease_rent STRING COMMENT '临时租约租金'
)

-- 新增表(stg、stg_dev)
create external table if not exists stg_base_info(
id string comment '主键',
***
) comment '基地信息表'
partitioned by (dt string comment '天分区', et string comment '库名称')
STORED AS TEXTFILE
TBLPROPERTIES (
'transactional' = 'false',
'parquet.compression' = 'snappy'
);

作业开发

数据数仓

脚本开发

1
2
3
4
5
6
7
8
9
10
11
12
13
create external table if not exists fct_*****(
id string comment '主键'
,****
,etl_time timestamp '数据更新时间'
)
comment "资产状态异动流水全量表"
partitioned by (dt string comment '天分区',et string comment '库名称') -- 根据实际情况是否需要分区表
stored as parquet
TBLPROPERTIES(
'transactional' = 'false',
'parquet.compression' = 'snappy'
);

作业开发

  1. 时间参数:

    众安 dataarts 说明 备注
    ${bizdate,yyyy-MM-dd,day,-,0} ${dt} T-1
    ${bizdate,yyyy-MM-dd,day,-,0} ${dt} T-1
    ${bizdate,yyyy-MM-dd,day,-,1} ${dt-1} T-2
    ${bizdate,yyyy-MM-dd,day,+,1} ${today} T(当天) 小时任务,写当天分区时使用${today}
  2. 运行程序参数 :
    –conf spark.name=ads_fin_comn_penalty_deduct_mf
    –conf spark.executor.memory=16G
    –conf spark.executor.cores=8
    –conf spark.num.executors=8
    –conf spark.sql.shuffle.partitions=128
    –conf spark.driver.memory=12G

  3. 依赖小时调度任务(注意:天调度任务依赖小时任务,依赖规则选择最近)

    1. 切换工作空间
    2. 选择依赖作业

数据消费

脚本开发

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
-- hive 建表规范
create external table if not exists rpt_*****(
id string comment '主键'
,****
,etl_time timestamp '数据更新时间'
)
comment "资产状态异动流水全量表"
partitioned by (dt string comment '天分区',et string comment '库名称') -- 根据实际情况是否需要分区表
stored as orc

-- doris 建表规范
SHOW TABLETS FROM your_table --查看实际 tablet 大小 每个 tablet 的压缩后数据大小(不含索引)保持在 1 GB 到 20 GB 之间
use dods; ----- 分区表
CREATE TABLE IF NOT EXISTS dods.dods_hive_ads_iot_formwork_warehouse_data_compare
(
saas_biz_date VARCHAR(2048) COMMENT "完成时间"
,diff_gen_num VARCHAR(2048) COMMENT "saas平台减智能平台根数"
,dt DATE COMMENT "按天分区"
)
ENGINE=OLAP
DUPLICATE KEY(saas_biz_date,saas_out_in_whs_type_name)
PARTITION BY RANGE (dt) ()
DISTRIBUTED BY HASH(saas_so_no) BUCKETS 1
PROPERTIES(
"dynamic_partition.time_unit" = "DAY"
,"dynamic_partition.start" = "-365"
,"dynamic_partition.end" = "3"
,"dynamic_partition.prefix" = "p"
,"dynamic_partition.buckets" = "1"
,"in_memory" = "false"
,"storage_format" = "V2"
,"dynamic_partition.create_history_partition" = "true"
,"dynamic_partition.history_partition_num" = "60"
,"compression"="snappy"
);
use dods; -- 非分区表
CREATE TABLE IF NOT EXISTS dods_hive_rpt_fin_turnover_lease_flow_dtl (
company_code VARCHAR(3048) NULL COMMENT "公司code"
,rantal_price DOUBLE NULL COMMENT "租赁单价"
,etl_time DATETIME NULL COMMENT "数据加载时间"
) ENGINE=OLAP
DUPLICATE KEY(company_code,company_name,jv_pro_line,dept_id
COMMENT '营业额_新租赁_明细表(财务口径)'
DISTRIBUTED BY HASH(company_code,company_name,jv_pro_line,dept_id) BUCKETS 4
PROPERTIES (
"in_memory" = "false"
,"storage_format" = "V2"
,"compression"="snappy"
);

作业开发

http请求方式推送 hive to fanruan

Rest Client 组件 参数配置

  • URL 地址
    http://bd-app.hongxinshop.com/api-task-schedule/api/finereport/open/update?platform=${fine_env}&packageName=工作过程&tbname=人力_员工工作计划_明细表
  • HTTP 方法GET
  • 是否需要判断返回值yes
  • 返回值字段路径code
  • 请求成功标志位200

集成作业方式推送 hive to doris

遇到如下报错建表语句替换为orc格式
img_v3_02124_45cf4e54-7f5c-4cd8-9964-e0c3e0bccd6g.jpg

元数据管理

img_v3_02124_06c1b22b-ba59-46ce-bfd8-1b732c926bbg.jpg

数据运维

切换到生产环境后查看运维概览、作业监控(补数据)、实例监控(通过运行状态找到运行失败的数据重跑)
image.jpg