基于Hadoop生态圈的数据仓库实践进阶技术十三.docx
《基于Hadoop生态圈的数据仓库实践进阶技术十三.docx》由会员分享,可在线阅读,更多相关《基于Hadoop生态圈的数据仓库实践进阶技术十三.docx(17页珍藏版)》请在冰豆网上搜索。
基于Hadoop生态圈的数据仓库实践进阶技术十三
基于Hadoop生态圈的数据仓库实践——进阶技术(十三)
十三、无事实的事实表
本节讨论一种技术,用来处理源数据中没有度量的需求。
例如,产品源数据不包含产品数量信息,如果系统需要得到产品的数量,很显然不能简单地从数据仓库中直接得到。
这时就要用到无事实的事实表技术。
使用此技术可以通过持续跟踪产品的发布来计算产品的数量。
可以创建一个只有产品(计什么数)和日期(什么时候计数)维度代理键的事实表。
之所以叫做无事实的事实表是因为表本身并没有度量。
1.产品发布的无事实事实表
本小节说明如何实现一个产品发布的无事实事实表,包括新增和初始装载product_count_fact表。
下图显示了跟踪产品发布数量的数据仓库模式(只显示与product_count_fact表有关的表)。
执行下面的脚本创建产品发布日期视图和无事实事实表。
[sql]viewplaincopy在CODE上查看代码片派生到我的代码片
USEdw;
CREATEVIEWproduct_launch_date_dim
(product_launch_date_sk,
product_launch_date,
month_name,
month,
quarter,
year,
promo_ind)
AS
SELECTDISTINCT
date_sk,
date,
month_name,
month,
quarter,
year,
promo_ind
FROMproduct_dima,date_dimb
WHEREa.effective_date=b.date;
CREATETABLEproduct_count_fact(
product_skINT,
product_launch_date_skINT);
说明:
产品发布日期视图只取产品生效日期,并不是日期维度里的所有日期。
product_launch_date_dim维度表是日期维度表的子集。
2.初始装载product_count_fact表
下面的脚本从product_dim表向product_count_fact表装载已有的产品发布信息。
脚本里的insert添加所有产品的第一个版本(即产品的首次发布日期)。
这里使用Hive的窗口函数row_number正确地选取了产品发布时的生效日期,而不是一个SCD2行的生效日期。
[sql]viewplaincopy在CODE上查看代码片派生到我的代码片
usedw;
insertoverwritetableproduct_count_fact
selectproduct_sk,date_sk
from(selecta.product_skproduct_sk,
a.product_codeproduct_code,
b.date_skdate_sk,
row_number()over(partitionbya.product_codeorderbyb.date_sk)rn
fromproduct_dima,date_dimb
wherea.effective_date=b.date)t
wherern=1;
使用下面的语句查询product_count_fact表以确认正确执行了初始装载。
[sql]viewplaincopy在CODE上查看代码片派生到我的代码片
selectproduct_sk,product_launch_date_skfromdw.product_count_fact;
查询结果如下图所示。
3.修改定期装载脚本
修改了数据仓库模式后,还需要修改定期装载脚本。
该脚本在导入product_dim表后还要导入product_count_fact表。
下面显示了修改后的定期装载脚本。
实际上只是把上面的初始装载脚本加在了装载销售订单事实表之前。
[sql]viewplaincopy在CODE上查看代码片派生到我的代码片
--设置环境与时间窗口
!
run/root/set_time.sql
--装载customer维度
--设置已删除记录和地址相关列上SCD2的过期,用<=>运算符处理NULL值。
UPDATEcustomer_dim
SETexpiry_date=${hivevar:
pre_date}
WHEREcustomer_dim.customer_skIN
(SELECTa.customer_sk
FROM(SELECTcustomer_sk,
customer_number,
customer_street_address,
customer_zip_code,
customer_city,
customer_state,
shipping_address,
shipping_zip_code,
shipping_city,
shipping_state
FROMcustomer_dimWHEREexpiry_date=${hivevar:
max_date})aLEFTJOIN
rds.customerbONa.customer_number=b.customer_number
WHEREb.customer_numberISNULLOR
(!
(a.customer_street_address<=>b.customer_street_address)
OR!
(a.customer_zip_code<=>b.customer_zip_code)
OR!
(a.customer_city<=>b.customer_city)
OR!
(a.customer_state<=>b.customer_state)
OR!
(a.shipping_address<=>b.shipping_address)
OR!
(a.shipping_zip_code<=>b.shipping_zip_code)
OR!
(a.shipping_city<=>b.shipping_city)
OR!
(a.shipping_state<=>b.shipping_state)
));
--处理customer_street_addresses列上SCD2的新增行
INSERTINTOcustomer_dim
SELECT
ROW_NUMBER()OVER(ORDERBYt1.customer_number)+t2.sk_max,
t1.customer_number,
t1.customer_name,
t1.customer_street_address,
t1.customer_zip_code,
t1.customer_city,
t1.customer_state,
t1.shipping_address,
t1.shipping_zip_code,
t1.shipping_city,
t1.shipping_state,
t1.version,
t1.effective_date,
t1.expiry_date
FROM
(
SELECT
t2.customer_numbercustomer_number,
t2.customer_namecustomer_name,
t2.customer_street_addresscustomer_street_address,
t2.customer_zip_codecustomer_zip_code,
t2.customer_citycustomer_city,
t2.customer_statecustomer_state,
t2.shipping_addressshipping_address,
t2.shipping_zip_codeshipping_zip_code,
t2.shipping_cityshipping_city,
t2.shipping_stateshipping_state,
t1.version+1version,
${hivevar:
pre_date}effective_date,
${hivevar:
max_date}expiry_date
FROMcustomer_dimt1
INNERJOINrds.customert2
ONt1.customer_number=t2.customer_number
ANDt1.expiry_date=${hivevar:
pre_date}
LEFTJOINcustomer_dimt3
ONt1.customer_number=t3.customer_number
ANDt3.expiry_date=${hivevar:
max_date}
WHERE(!
(t1.customer_street_address<=>t2.customer_street_address)
OR!
(t1.customer_zip_code<=>t2.customer_zip_code)
OR!
(t1.customer_city<=>t2.customer_city)
OR!
(t1.customer_state<=>t2.customer_state)
OR!
(t1.shipping_address<=>t2.shipping_address)
OR!
(t1.shipping_zip_code<=>t2.shipping_zip_code)
OR!
(t1.shipping_city<=>t2.shipping_city)
OR!
(t1.shipping_state<=>t2.shipping_state)
)
ANDt3.customer_skISNULL)t1
CROSSJOIN
(SELECTCOALESCE(MAX(customer_sk),0)sk_maxFROMcustomer_dim)t2;
--处理customer_name列上的SCD1
--因为hive的update的set子句还不支持子查询,所以这里使用了一个临时表存储需要更新的记录,用先delete再insert代替update
--因为SCD1本身就不保存历史数据,所以这里更新维度表里的所有customer_name改变的记录,而不是仅仅更新当前版本的记录
DROPTABLEIFEXISTStmp;
CREATETABLEtmpAS
SELECT
a.customer_sk,
a.customer_number,
b.customer_name,
a.customer_street_address,
a.customer_zip_code,
a.customer_city,
a.customer_state,
a.shipping_address,
a.shipping_zip_code,
a.shipping_city,
a.shipping_state,
a.version,
a.effective_date,
a.expiry_date
FROMcustomer_dima,rds.customerb
WHEREa.customer_number=b.customer_numberAND!
(a.customer_name<=>b.customer_name);
DELETEFROMcustomer_dimWHEREcustomer_dim.customer_skIN(SELECTcustomer_skFROMtmp);
INSERTINTOcustomer_dimSELECT*FROMtmp;
--处理新增的customer记录
INSERTINTOcustomer_dim
SELECT
ROW_NUMBER()OVER(ORDERBYt1.customer_number)+t2.sk_max,
t1.customer_number,
t1.customer_name,
t1.customer_street_address,
t1.customer_zip_code,
t1.customer_city,
t1.customer_state,
t1.shipping_address,
t1.shipping_zip_code,
t1.shipping_city,
t1.shipping_state,
1,
${hivevar:
pre_date},
${hivevar:
max_date}
FROM
(
SELECTt1.*FROMrds.customert1LEFTJOINcustomer_dimt2ONt1.customer_number=t2.customer_number
WHEREt2.customer_skISNULL)t1
CROSSJOIN
(SELECTCOALESCE(MAX(customer_sk),0)sk_maxFROMcustomer_dim)t2;
--重载PA客户维度
TRUNCATETABLEpa_customer_dim;
INSERTINTOpa_customer_dim
SELECT
customer_sk
customer_number
customer_name
customer_street_address
customer_zip_code
customer_city
customer_state
shipping_address
shipping_zip_code
shipping_city
shipping_state
version
effective_date
expiry_date
FROMcustomer_dim
WHEREcustomer_state='PA';
--装载product维度
--设置已删除记录和product_name、product_category列上SCD2的过期
UPDATEproduct_dim
SETexpiry_date=${hivevar:
pre_date}
WHEREproduct_dim.product_skIN
(SELECTa.product_sk
FROM(SELECTproduct_sk,product_code,product_name,product_category
FROMproduct_dimWHEREexpiry_date=${hivevar:
max_date})aLEFTJOIN
rds.productbONa.product_code=b.product_code
WHEREb.product_codeISNULLOR(a.product_name<>b.product_nameORa.product_category<>b.product_category));
--处理product_name、product_category列上SCD2的新增行
INSERTINTOproduct_dim
SELECT
ROW_NUMBER()OVER(ORDERBYt1.product_code)+t2.sk_max,
t1.product_code,
t1.product_name,
t1.product_category,
t1.version,
t1.effective_date,
t1.expiry_date
FROM
(
SELECT
t2.product_codeproduct_code,
t2.product_nameproduct_name,
t2.product_categoryproduct_category,
t1.version+1version,
${hivevar:
pre_date}effective_date,
${hivevar:
max_date}expiry_date
FROMproduct_dimt1
INNERJOINrds.productt2
ONt1.product_code=t2.product_code
ANDt1.expiry_date=${hivevar:
pre_date}
LEFTJOINproduct_dimt3
ONt1.product_code=t3.product_code
ANDt3.expiry_date=${hivevar:
max_date}
WHERE(t1.product_name<>t2.product_nameORt1.product_category<>t2.product_category)ANDt3.product_skISNULL)t1
CROSSJOIN
(SELECTCOALESCE(MAX(product_sk),0)sk_maxFROMproduct_dim)t2;
--处理新增的product记录
INSERTINTOproduct_dim
SELECT
ROW_NUMBER()OVER(ORDERBYt1.product_code)+t2.sk_max,
t1.product_code,
t1.product_name,
t1.product_category,
1,
${hivevar:
pre_date},
${hivevar:
max_date}
FROM
(
SELECTt1.*FROMrds.productt1LEFTJOINproduct_dimt2ONt1.product_code=t2.product_code
WHEREt2.product_skISNULL)t1
CROSSJOIN
(SELECTCOALESCE(MAX(product_sk),0)sk_maxFROMproduct_dim)t2;
--装载product_count_fact表
insertoverwritetableproduct_count_fact
selectproduct_sk,date_sk
from(selecta.product_skproduct_sk,
a.product_codeproduct_code,
b.date_skdate_sk,
row_number()over(partitionbya.product_codeorderbyb.date_sk)rn
fromproduct_dima,date_dimb
wherea.effective_date=b.date)t
wherern=1;
--装载销售订单事实表
--前一天新增的销售订单
INSERTINTOsales_order_fact
SELECT
a.order_number,
customer_sk,
product_sk,
g.sales_order_attribute_sk,
e.order_date_sk,
null,
null,
null,
null,
null,
null,
null,
null,
f.request_delivery_date_sk,
order_amount,
quantity
FROM
rds.sales_ordera,
customer_dimc,
product_dimd,
order_date_dime,
request_delivery_date_dimf,
sales_order_attribute_dimg,
rds.cdc_timeh
WHERE
a.order_status='N'
ANDa.customer_number=c.customer_number
ANDa.status_date>=c.effective_date
ANDa.status_dateANDa.product_code=d.product_code
ANDa.status_date>=d.effective_date
ANDa.status_dateANDto_date(a.status_date)=e.order_date
ANDto_date(a.request_delivery_date)=f.request_delivery_date
ANDa.verification_ind=g.verification_ind
ANDa.credit_check_flag=g.credit_check_flag
ANDa.new_customer_ind=g.new_customer_ind
ANDa.web_order_flag=g.web_order_flag
ANDa.entry_date>=h.last_loadANDa.entry_date--处理分配库房、打包、配送和收货四个状态
DROPTABLEIFEXISTStmp;
CREATETABLEtmpAS
selectt0.order_numberorder_number,
t0.customer_skcustomer_sk,