一、one_id概述
用戶畫像專案有兩個核心內容:用戶畫像標簽和用戶one_id,
用戶畫像標簽體系,是波士頓咨詢公司的團隊幫忙搭建的,而我本人則是負責one_id的設計和生成,one_id,是用戶唯一標識,用于海量資料中識別出同一個人,目前市面上有兩種one_id:無中生有的one_id和基于現有用戶資料生成的one_id,后者是前者的子集,
-
無中生有的one_id,就是基于用戶訪問資料:
- 獲取用戶的(上網設備)設備號、ip地址等生成one_id
- 基于用戶注冊后留下手機號、身份證等基本資訊,將不同的one_id識別成同一個人,進一步融合成新的one_id,
【難點】:無中生有的one_id的技術難點在于如何獲取用戶設備號,可是現在很多設備商都不讓你獲取設備號了,
-
而我們用戶畫像專案組的one_id,則是基于現有的用戶資訊,去生成的one_id,
- 現有的資訊包括:
1、線索資料:汽車垂直媒體獲取到的用戶線索,線索資料基本是外部資料,
2、潛客資料:用戶通過線索來到線下門店或自來客的用戶資料
3、車主資料:購車資料、車主認證資料、APP注冊資料、續保資料、維修保養資料等等
【難點】:這個專案的難點在于,如何將多條用戶資料識別成同一個人,很多人馬上就想到了,通過身份證號,那沒有身份證號的呢,怎么識別成同一個人?沒有身份證號,那咱們就給他造一個“身份證號”——phone_id,具體怎么操作,請繼續往下看,
- 現有的資訊包括:
-
one_id維度表表結構
CREATE TABLE IF NOT EXISTS dw.dim_one_id(
sk_id bigint COMMENT "自增主鍵"
,phone string COMMENT "該表唯一手機號"
,one_id bigint COMMENT "用戶唯一標識"
,nature string COMMENT "屬性:個人/組織"
,one_id_role string COMMENT "one-id角色:線索、潛客、車主"
,user_name_first string COMMENT "同一手機號取表優先級最高的名字"
,user_name string COMMENT "同一證件/手機號名字頻次最高的名字"
,sex string COMMENT "性別"
,user_license string COMMENT "證件號"
,user_birth string COMMENT "出生日期"
,user_email string COMMENT "郵件地址"
,user_qq string COMMENT "QQ"
,user_wx string COMMENT "微信"
,user_addr string COMMENT "住址"
,source_list string COMMENT "資料源串列"
,source_pd_list string COMMENT "主鍵串列"
)
COMMENT 'dw層--one_id維度表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\036'
STORED AS PARQUET
;
二、one_id的目的和本質
one_id的目的就是,通過one_id去觸達更多的客戶,通過什么去觸達呢?手機號,所以one_id的本質就是“電話簿”,
三、one_id的基本假設
one_id的基本假設,其實就是從one_id的本質抽象過來的,one_id的基本假設就是“電話簿”的特征,
- 一個手機號只能觸達一個人,
- 一個人可以擁有多個手機號,
one_id的基本假設,同時也是one_id的資料清洗規則和融合邏輯,
四、one_id的源資料問題
- 一個手機號 有多個用戶名稱 或 多個證件號,
- 一個證件號 有多個用戶名稱,
- 只有一成資料有身份證號(包括重復證件號,如20個人擁有同一個證件號)
五、資料的可信度
假如同一個手機號,該用戶在“懂車帝”上留的用戶名是“妮可羅賓小可愛”,而他在購車(實銷訂單)的時候留的用戶名是“李建鋼”,你覺得,他在哪里留的資訊可信度比較高?那肯定是購車的時候!
所以基于資訊可信度,我會給資料來源表,分優先級1、2、3…數字越小,可信度越高,而我給優先級的邏輯是這樣:
- 根據用戶運營流程,給”資料來源表“標記:拉新、留存、轉化,把來源表分成三類,資訊可信度:轉化>留存>拉新
- 基于步驟1得到的分類,然后在類里面進行排序,先給”轉化類“的來源表進行排序,其次是”留存類“,最后才是“拉新類”,
注:排了表的優先級,一定要跟業務方同步和調整,
六、資料清洗
1、去重
- 源資料匯總中間表
CREATE TABLE IF NOT EXISTS ods.etl_one_id1 (
source STRING COMMENT '資料源'
, source_level INT COMMENT '表優先級'
, source_pd STRING COMMENT '該條資料在源資料的主鍵'
, user_name STRING COMMENT '姓名'
, sex STRING COMMENT '性別'
, phone1 STRING COMMENT '手機號1'
, phone2 STRING COMMENT '手機號2'
, user_license STRING COMMENT '證件號'
, user_birth STRING COMMENT '出生日期'
, user_email STRING COMMENT '郵件地址'
, user_qq STRING COMMENT 'QQ'
, user_wx STRING COMMENT '微信'
, user_addr STRING COMMENT '住址'
, etl_dt STRING COMMENT 'ETL時間'
)
PARTITIONED BY (TBL_NAME STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\036'
STORED AS PARQUET
;
- 源資料去重
CREATE TABLE ODS.ETL_ONE_ID1A
AS
SELECT
*
FROM
(
SELECT
source
, source_level
, source_pd
, user_name
, sex
, phone1
, phone2
, user_license
, user_birth
, user_email
, user_qq
, user_wx
, user_addr
,row_number() over (partition by source,source_pd order by COALESCE(phone1 ,phone2) desc) as rn
FROM
ODS.ETL_ONE_ID1
where phone1 is not null OR phone2 IS NOT NULL
) t
WHERE RN=1
;
2、標記來源表“資訊可信度優先級”
- 表優先級維表
CREATE TABLE IF NOT EXISTS dw.dim_table_level (
source STRING COMMENT '資料源--表名'
, source_level INT COMMENT '表優先級'
)
COMMENT 'dw層--來源表優先級 維度表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\036'
STORED AS PARQUET
;
INSERT INTO TABLE dw.dim_table_level VALUES
('某4S店銷售訂單表',2)
,('某汽車APP車主認證表',1)
,('某4S店訪客登記表',3)
,('懂車帝用戶表',4)
,('易車網用戶表',5)
;
- 獲得優先級
--源表給定【表優先順序】
CREATE TABLE ODS.ETL_ONE_ID1B
AS
SELECT
A.source
, B.source_level
, A.source_pd
, A.user_name
, A.sex
, A.phone1
, A.phone2
, A.user_license
, A.user_birth
, A.user_email
, A.user_qq
, A.user_wx
, A.user_addr
FROM
ods.ETL_ONE_ID1A A
LEFT JOIN
dw.dim_table_level B
ON A.source=B.source
;在這里插入代碼片
3、空字串置為NULL
- 目標表:ODS.ETL_ONE_ID2
CREATE TABLE ODS.ETL_ONE_ID2
AS
SELECT
source
,source_level
,IF(LENGTH(source_pd)=0,NULL,source_pd) AS source_pd
,IF(LENGTH(user_name)=0,NULL,user_name) AS user_name
,IF(LENGTH(sex)=0,NULL,sex) AS sex
,IF(LENGTH(phone1)=0,NULL,phone1) AS phone1
,IF(LENGTH(phone2)=0,NULL,phone2) AS phone2
,IF(LENGTH(user_license)=0,NULL,user_license) AS user_license
,IF(LENGTH(user_birth)=0,NULL,user_birth) AS user_birth
,IF(LENGTH(user_email)=0,NULL,user_email) AS user_email
,IF(LENGTH(user_qq)=0,NULL,user_qq) AS user_qq
,IF(LENGTH(user_wx)=0,NULL,user_wx) AS user_wx
,IF(LENGTH(user_addr)=0,NULL,user_addr) AS user_addr
FROM
ODS.ETL_ONE_ID1B
;
4、去除字串中特殊符號
- 目標表:ODS.ETL_ONE_ID3
CREATE TABLE ODS.ETL_ONE_ID3
AS
SELECT
source
,source_level
,regexp_replace(source_pd,'[\\s]+|[\\u3000]+|[\,]','') AS source_pd
,regexp_replace(user_name,'[\\s]+|[\\u3000]+|[\,]','') AS user_name
,regexp_replace(sex,'[\\s]+|[\\u3001]+|[\,]','') AS sex
,regexp_replace(phone1,"[\s+\!\/_,$%^*(+\"\')]+|[\\s]+|[\\u3002]+|[\,]+|[\\\u4E00-\\\u9FA5]+|[::+——()?【】“”!,,?、~@#¥%……&*().-]+", '') AS phone1
,regexp_replace(phone2,"[\s+\!\/_,$%^*(+\"\')]+|[\\s]+|[\\u3002]+|[\,]+|[\\\u4E00-\\\u9FA5]+|[::+——()?【】“”!,,?、~@#¥%……&*().-]+", '') AS phone2
,regexp_replace(user_license,'[\\s]+|[\\u3004]+|[\,]','') AS user_license
,regexp_replace(user_birth,'[\\s]+|[\\u3005]+|[\,]','') AS user_birth
,regexp_replace(user_email,'[\\s]+|[\\u3006]+|[\,]','') AS user_email
,regexp_replace(user_qq,'[\\s]+|[\\u3007]+|[\,]','') AS user_qq
,regexp_replace(user_wx,'[\\s]+|[\\u3008]+|[\,]','') AS user_wx
,regexp_replace(user_addr,'[\\s]+|[\\u3009]+|[\,]','') AS user_addr
FROM
ODS.ETL_ONE_ID2
;
5、正則確保格式正確
- 目標表:ODS.ETL_ONE_ID4
CREATE TABLE ODS.ETL_ONE_ID4
AS
SELECT
source
,source_level
,source_pd
,CASE
WHEN user_name IS NOT NULL
THEN regexp_extract(user_name,'([\\\u4E00-\\\u9FA5]+)',1)
WHEN LENGTH(user_name)=0 THEN NULL
ELSE NULL
END user_name
,sex
,CASE
WHEN phone1 IS NOT NULL AND phone1 regexp "^(010|02\\d|0[3-9]\\d{2})?\\d{6,8}$"=TRUE THEN phone1 --座機號清洗
WHEN phone1 IS NOT NULL AND phone1 regexp "^(010|02\\d|0[3-9]\\d{2})\\-?\\d{6,8}$"=TRUE THEN phone1 --座機號清洗
WHEN phone1 IS NOT NULL AND phone1 regexp "^1[3-9]\\d{9}$"=TRUE THEN phone1 --手機號清洗
WHEN phone1 IS NOT NULL AND phone1 regexp "^0+1[3-9]\\d{9}$"=TRUE THEN substr(phone1,-11) --手機號清洗
ELSE NULL
END AS phone1
,CASE
WHEN phone2 IS NOT NULL AND phone2 regexp "^(010|02\\d|0[3-9]\\d{2})?\\d{6,8}$"=TRUE THEN phone2 --座機號清洗
WHEN phone2 IS NOT NULL AND phone2 regexp "^(010|02\\d|0[3-9]\\d{2})\\-?\\d{6,8}$"=TRUE THEN phone2 --座機號清洗
WHEN phone2 IS NOT NULL AND phone2 regexp "^1[3-9]\\d{9}$"=TRUE THEN phone2 --手機號清洗
WHEN phone2 IS NOT NULL AND phone2 regexp "^0+1[3-9]\\d{9}$"=TRUE THEN substr(phone2,-11) --手機號清洗
ELSE NULL
END AS phone2
,user_license
,CASE
WHEN user_birth > ADD_MONTHS(CURRENT_DATE,-204) or user_birth <ADD_MONTHS(CURRENT_DATE,-1200) ---年齡大于100歲 或 小于17歲置為null
THEN NULL
ELSE SUBSTR(user_birth,0,10)
END AS user_birth
,user_email
,CASE
WHEN user_qq regexp('([0-9]{5,11})')=true then regexp_extract(user_qq,'([0-9]{5,11})',1)
ELSE NULL
end AS user_qq
,CASE
WHEN user_wx regexp('([a-zA-Z]{1}[-_a-zA-Z0-9]{5,19})')=TRUE
OR user_wx regexp('(([0][3-9]{1}[0-9]{9})|([1][3456789][0-9]{9}))')=TRUE
OR user_wx regexp('([0-9]{5,11})')=true
THEN regexp_extract(user_wx,'(([a-zA-Z]{1}[-_a-zA-Z0-9]{5,19})|([0][3-9]{1}[0-9]{9})|([1][3456789][0-9]{9})|([0-9]{5,11}))',1)
ELSE NULL
END AS user_wx
,user_addr
FROM
ODS.ETL_ONE_ID3
;
6、各欄位清洗邏輯
- 目標表:ODS.ETL_ONE_ID5
注:以下代碼包含了三個自定義函式,本文不進行具體展開:
- idennum( ),身份證校驗
- phones( ),手機號/座機號校驗
- checks( ),企業信用代碼校驗
CREATE TABLE ODS.ETL_ONE_ID5
AS
SELECT
source
,source_level
,source_pd
,IF(LENGTH(user_name)=0,NULL,user_name) AS user_name
,CASE
WHEN LENGTH(user_license)=18
AND default.idennum(user_license) IS NOT NULL
THEN IF(substr(user_license,17,1)%2 = 0,'女','男')--18位身份證提取性別
WHEN LENGTH(user_license)=15
AND default.idennum(user_license) IS NOT NULL
THEN IF(substr(user_license,15,1)%2 = 0,'女','男') --15位身份證提取性別
ELSE sex
END AS sex
,default.phones(phone1) AS phone1
,default.phones(phone2) AS phone2
,CASE
when length(user_license)=18
AND user_license=regexp_extract(user_license,'(^[1-9][0-9]{5}[1-9][0-9]{3}((0[1-9])|(1[0-2]))((0[1-9])|([1|2][0-9])|(3[0|1]))[0-9Xx]{4})',1)
THEN default.idennum(user_license) --18位身份證校驗
when length(user_license)=15
AND default.idennum(user_license) IS NOT NULL
THEN user_license --15位身份證校驗
when default.checks(user_license) is not null
THEN default.checks(user_license) --企業代碼驗證
else null
end as user_license
,CASE
when length(user_license)=18
AND user_license=regexp_extract(user_license,'(^[1-9][0-9]{5}[1-9][0-9]{3}((0[1-9])|(1[0-2]))((0[1-9])|([1|2][0-9])|(3[0|1]))[0-9Xx]{4})',1)
THEN SUBSTR(CAST(from_unixtime(UNIX_TIMESTAMP(substr(user_license,7, 8),'yyyyMMdd')) AS STRING),0,10)--18位身份證提取生日
when length(user_license)=15
AND default.idennum(user_license) IS NOT NULL
THEN SUBSTR(CAST(from_unixtime(UNIX_TIMESTAMP(CONCAT('19',SUBSTR(user_license,7, 6)),'yyyyMMdd')) AS STRING),0,10) --15位身份證提取生日
ELSE user_birth
END AS user_birth
,CASE
WHEN user_email IS NOT NULL
AND user_email=regexp_extract(user_email,'([a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+)',1)
THEN user_email
ELSE CONCAT(user_qq,'@qq.com')
END AS user_email
,user_qq
,user_wx
,user_addr
FROM
ODS.ETL_ONE_ID4
;
7、phone1和phone2融合
- 目標表:ODS.ETL_ONE_ID7
- 再次進行“空字串置為null”
CREATE TABLE ODS.ETL_ONE_ID6
AS
SELECT
source
,source_level
,source_pd
,IF(LENGTH(user_name)=0,NULL,user_name) AS user_name --為了避免進行 【同源一機一人】的篩選被篩選掉
,IF(LENGTH(sex)=0,NULL,sex) AS sex
,IF(LENGTH(phone1)=0,NULL,phone1) AS phone1
,IF(LENGTH(phone2)=0,NULL,phone2) AS phone2
,IF(LENGTH(user_license)=0,NULL,user_license) AS user_license
,IF(LENGTH(user_birth)=0,NULL,user_birth) AS user_birth
,IF(LENGTH(user_email)=0,NULL,user_email) AS user_email
,IF(LENGTH(user_qq)=0,NULL,user_qq) AS user_qq
,IF(LENGTH(user_wx)=0,NULL,user_wx) AS user_wx
,IF(LENGTH(user_addr)=0,NULL,user_addr) AS user_addr
FROM
ODS.ETL_ONE_ID5
;
- phone1 和 phone2進行融合
- 注:phone1和phone2有以下三種情況:
- 兩個手機號/座機號是一樣的
- 兩個手機號/座機號不一樣
- 其中一個null,另一個不為null
- 注:phone1和phone2有以下三種情況:
CREATE TABLE ODS.ETL_ONE_ID7
AS
SELECT
A.source
,A.source_level
,A.source_pd
,A.user_name
,A.sex
,A.phone
,A.user_license
,IF(A.user_birth > ADD_MONTHS(CURRENT_DATE,-204) or A.user_birth <ADD_MONTHS(CURRENT_DATE,-1200),NULL,A.user_birth) AS user_birth ---年齡大于100歲 或 小于17歲置為null
,A.user_email
,A.user_qq
,A.user_wx
,A.user_addr
FROM
(
SELECT
source
,source_level
,source_pd
,user_name
,sex
,phone1 as phone
,user_license
,user_birth
,user_email
,user_qq
,user_wx
,user_addr
FROM
ODS.ETL_ONE_ID6
WHERE phone1 IS NOT NULL
GROUP BY source
,source_level
,source_pd
,user_name
,sex
,phone1
,user_license
,user_birth
,user_email
,user_qq
,user_wx
,user_addr
UNION ALL
SELECT
source
,source_level
,source_pd
,user_name
,sex
,phone2 as phone
,user_license
,user_birth
,user_email
,user_qq
,user_wx
,user_addr
FROM
ODS.ETL_ONE_ID6
WHERE phone2 IS NOT NULL
GROUP BY source
,source_level
,source_pd
,user_name
,sex
,phone2
,user_license
,user_birth
,user_email
,user_qq
,user_wx
,user_addr
) A
GROUP BY A.source
,A.source_level
,A.user_name
,A.source_pd
,A.sex
,A.phone
,A.user_license
,IF(A.user_birth > ADD_MONTHS(CURRENT_DATE,-204) or A.user_birth <ADD_MONTHS(CURRENT_DATE,-1200),NULL,A.user_birth) ---年齡大于100歲 或 小于17歲置為null
,A.user_email
,A.user_qq
,A.user_wx
,A.user_addr
;
8、同一資料源,確保一個手機號只能對應一個人
- 目標表:ODS.tmp_source_phone_unique1
- 表描述:【同源一機一人】表
CREATE TABLE ODS.tmp_source_phone_unique1 --同一資料源 【同源一機一人】
AS
SELECT
A.source
,A.source_level
,A.source_pd
,A.user_name
,A.sex
,A.phone
,A.user_license
,A.user_birth
,A.user_email
,A.user_qq
,A.user_wx
,A.user_addr
FROM
ODS.ETL_ONE_ID7 A
JOIN
(
SELECT
*
FROM
(
SELECT
phone
,source
,COUNT(DISTINCT user_name) AS num_uesr_name --同一個手機號,機主數量,基于用戶名字 ,風險:不同數源的用戶名不同,從而會造成資料
,COUNT(DISTINCT user_license) AS num_license --同一個手機號,機主數量,基于用戶證件
FROM
ODS.ETL_ONE_ID7
GROUP BY source
,phone
) B1
WHERE B1.num_uesr_name<=1 OR B1.num_license=1 --確保同一個資料源,【一機一人】,考慮到一個人只有一個稱呼或不填
) B
ON
A.phone=B.phone
AND A.source=B.source
;
9、同一資料源,確保一個身份證只能對應一個人
- 目標表:ODS.tmp_source_phone_unique
- 表描述:【同源一機一人、同源一證一人】表
- 邏輯:同一資料源,確保一個身份證最多只能對應兩臺手機,超過兩臺,該身份證號置為null
CREATE TABLE ODS.tmp_source_phone_unique --同一資料源 【同源一證一人】
AS
SELECT
source
,source_level
,source_pd
,user_name
,sex
,phone
,IF(num_phone_license>2,NULL,user_license) AS user_license
,user_birth
,user_email
,user_qq
,user_wx
,user_addr
FROM
(
SELECT
source
,source_level
,source_pd
,user_name
,sex
,phone
,user_license
,COUNT(DISTINCT phone) OVER(PARTITION BY source,user_license ) AS num_phone_license --同一個資料源一個身份證號對應的手機號數量
,user_birth
,user_email
,user_qq
,user_wx
,user_addr
FROM
ODS.tmp_source_phone_unique1
WHERE default.idennum(user_license) IS NOT NULL
) TT
UNION ALL
SELECT
source
,source_level
,source_pd
,user_name
,sex
,phone
,user_license
,user_birth
,user_email
,user_qq
,user_wx
,user_addr
FROM
ODS.tmp_source_phone_unique1
WHERE default.idennum(user_license) IS NULL
;
七、資料融合
1、給所有用戶造一個“身份證號”phone_id
- phone_id維度表 表結構
CREATE TABLE IF NOT EXISTS dw.dim_phone_id(
phone_id string COMMENT "手機號標識"
,phone string COMMENT "手機號"
,times string COMMENT "手機號在全資料源中的頻次"
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\036'
STORED AS PARQUET
;
- 生成 phone_id 的三個原則:
- 一個phone_id視為一個人
- 一個phone_id可以有多個手機號
- 一個phone只能對應一個phone_id
第二點原則呼應了“一個人可以有多個手機號”的基本假設,
第三點原則呼應了“一個手機號只能觸達一個人”的基本假設,
- phone_id的取數邏輯:
- 同一資料源同一自增主鍵,有兩個手機號,取手機號出現頻次最高的手機號作為這兩條資料的phone_id
- 全部資料源,確保一個手機號只能對應一個phone_id
- 全部資料源,確保一個phone_id最多只能對應3個手機號,超過3個就將該行資料的phone列作為該行資料的phone_id
- phone_id的作用:
- 資料先基于“身份證號”去融合,其次是“phone_id”
- 生成one_id,優先用“身份證號”進行hash值計算,其次是用“phone_id”進行hash值計算
- 實作步驟:
- 基于全部資料源,統計手機號總共出現在幾個資料源,即“手機號頻次”
CREATE TABLE ods.etl_phone_times --手機號頻次表
AS
SELECT
phone
,COUNT( DISTINCT source) AS times
FROM
ods.tmp_source_phone_unique
GROUP BY phone
;
- 獲取phone_id:【同源一機一人、同源一證一人】表ODS.tmp_source_phone_unique 去聯表 【手機號頻次表】ods.etl_phone_times獲得每個手機號的頻次,然后基于source,source_pd(資料源,該資料源自增主鍵)去分組,取頻次高的手機號作為"phone_id":
- 目標表:ods.etl_phone_id
CREATE TABLE ods.etl_phone_id
AS
SELECT
phone_id
,phone
,times
FROM
(
SELECT
FIRST_VALUE(phone) OVER (partition by source,source_pd ORDER BY times DESC ) AS phone_id --同一資料源,同一主鍵 有兩個不同手機號,以頻次高的手機號作為一個用戶標識 phone_id
,phone
,times
FROM
(
SELECT
A.source
,A.source_pd
,A.phone
,B.times
FROM
ods.tmp_source_phone_unique A
LEFT JOIN
ods.etl_phone_times B
ON A.phone=B.phone
) t011
WHERE phone regexp "^(010|02\\d|0[3-9]\\d{2})?\\d{6,8}$"=FALSE --座機號頻次太高,故只取 非座機號去生成 phone_id
UNION ALL
SELECT
phone AS phone_id --座機號作為主鍵一個用戶標識
,phone
,times
FROM
(
SELECT
A.source
,A.source_pd
,A.phone
,B.times
FROM
ods.tmp_source_phone_unique A
LEFT JOIN
ods.etl_phone_times B
ON A.phone=B.phone
) t011
WHERE phone regexp "^(010|02\\d|0[3-9]\\d{2})?\\d{6,8}$"=TRUE --篩選出座機號
) t01
GROUP BY phone_id
,phone
,times
;
- 確保一個手機號只對應一個phone_id
- 目標表:ods.etl_phone_id1
CREATE TABLE ods.etl_phone_id1
AS
SELECT
phone_id
,phone
,times
FROM
(
SELECT
IF(num_phone_id>1,phone,phone_id) AS phone_id
,phone
,times
FROM
(
SELECT
phone_id
,phone
,times
,COUNT(DISTINCT phone_id) OVER (PARTITION BY phone ) AS num_phone_id
FROM
ods.etl_phone_id
) t1
) TT
GROUP BY
phone_id
,phone
,times
;
- 確保全部資料源,一個phone_id最多對應3個手機號
- 目標表:ods.etl_phone_id2
CREATE TABLE ods.etl_phone_id2
AS
SELECT
IF(num_phone>3,phone,phone_id) AS phone_id
,phone
,times
FROM
(
SELECT
phone_id
,COUNT(DISTINCT phone) OVER(PARTITION BY phone_id ) AS num_phone
,phone
,times
FROM
ods.etl_phone_id1
) TT
;
- 將資料加載到phone_id維度表
- 目標表:dw.dim_phone_id
INSERT OVERWRITE TABLE dw.dim_phone_id
SELECT
IF(B.phone_id IS NULL,A.phone_id,B.phone_id) AS phone_id
,A.phone
,A.times
FROM
ods.etl_phone_id2 A
LEFT JOIN
dw.dim_phone_id B
ON A.phone=B.phone
;
2、基于資訊可信度,補齊每個手機號的基本資訊
目標表:ods.phone_fuse
CREATE TABLE IF NOT EXISTS ods.phone_fuse(
phone_id string COMMENT "手機號識別成一個人"
,phone string COMMENT "手機號"
,phone_times string COMMENT "手機號在全資料源中的頻次"
,user_name_first string COMMENT "基于表優先級取到的用戶名"
,sex string COMMENT "性別"
,user_license string COMMENT "證件號"
,user_birth string COMMENT "出生日期"
,user_email string COMMENT "郵件地址"
,user_qq string COMMENT "QQ"
,user_wx string COMMENT "微信"
,user_addr string COMMENT "住址"
,source_list string COMMENT "資料源串列"
,source_pd_list string COMMENT "主鍵串列"
)
COMMENT 'ods層--基于手機號基本資訊融合表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\036'
STORED AS PARQUET
;
假如同一個手機號,該用戶在“懂車帝”上留的用戶名是“妮可羅賓小可愛”,而他在購車(實銷訂單)的時候留的用戶名是“李建鋼”,你覺得,他在哪里留的資訊可信度比較高?那肯定是購車的時候!
所以基于資訊可信度,我會給資料來源表,分優先級1、2、3…數字越小,可信度越高,
有了這些條件,你怎么去補齊每個手機號的基本資訊呢?我是這樣操作的:
1、以dw.dim_phone_id作為主表,該表的phone是不重復的,作為ods.phone_fuse的主鍵,
SELECT
phone_id
,phone
,times AS phone_times
FROM
cdp_dw.dim_phone_id
2、其他欄位如用戶名、性別、證件號、生日等欄位,我是這樣取的:
以“用戶名”取值為例:
-
基于【同源一機一人、同源一證一人】表ods.tmp_source_phone_unique,篩選出“用戶名”不為null的資料
-
基于步驟1,用以下代碼取出優先級最高的“用戶名”
1. ROW_NUMBER() OVER(PARTITION BY phone ORDER BY source_level ) AS RN 2. where RN=1
SELECT
*
FROM
(
SELECT
phone
,user_name AS user_name_first
,ROW_NUMBER() OVER(PARTITION BY phone ORDER BY source_level ) AS RN
,source
,source_pd
FROM
ods.tmp_source_phone_unique
WHERE user_name IS NOT NULL AND phone IS NOT NULL
) A21
WHERE A21.RN=1
3、融合完整代碼:
INSERT OVERWRITE TABLE ods.phone_fuse
SELECT
A1.phone_id
,A1.phone
,A1.phone_times
,A2.user_name_first
,nvl(B.sex,'未知') AS sex
,C.user_license
,D.user_birth
,E.user_email
,F.user_qq
,G.user_wx
,H.user_addr
,CONCAT_WS(',',A2.source
,B.source
, C.source
, D.source
, E.source
, F.source
, G.source
, H.source
) AS source_list
,CONCAT_WS(',',A2.source_pd
,B.source_pd
, C.source_pd
, D.source_pd
, E.source_pd
, F.source_pd
, G.source_pd
, H.source_pd
) AS source_pd_list
FROM
(
SELECT
phone_id
,phone
,times AS phone_times
FROM
cdp_dw.dim_phone_id
) A1 --手機號主表 【全源一機頻次】表
LEFT JOIN
(
SELECT
*
FROM
(
SELECT
phone
,user_name AS user_name_first
,ROW_NUMBER() OVER(PARTITION BY phone ORDER BY source_level ) AS RN
,source
,source_pd
FROM
ods.tmp_source_phone_unique
WHERE user_name IS NOT NULL AND phone IS NOT NULL
) A21
WHERE A21.RN=1
) A2 --姓名
ON A1.phone=A2.phone
LEFT JOIN
(
SELECT
*
FROM
(
SELECT
phone
,sex
,ROW_NUMBER() OVER(PARTITION BY phone ORDER BY source_level ) AS RN
,source
,source_pd
FROM
ods.tmp_source_phone_unique
WHERE sex IS NOT NULL AND sex<>'未知' AND phone IS NOT NULL
) B1
WHERE B1.RN=1
) B --性別
ON A1.phone=B.phone
LEFT JOIN
(
SELECT
*
FROM
(
SELECT
phone
,user_license
,ROW_NUMBER() OVER(PARTITION BY phone ORDER BY source_level ) AS RN
,source
,source_pd
FROM
ods.tmp_source_phone_unique
WHERE user_license IS NOT NULL AND phone IS NOT NULL
) C1
WHERE C1.RN=1
) C --證件號
ON A1.phone=C.phone
LEFT JOIN
(
SELECT
*
FROM
(
SELECT
phone
,user_birth
,ROW_NUMBER() OVER(PARTITION BY phone ORDER BY source_level ) AS RN
,source
,source_pd
FROM
ods.tmp_source_phone_unique
WHERE user_birth IS NOT NULL AND phone IS NOT NULL
) D1
WHERE D1.RN=1
) D --出生日期
ON A1.phone=D.phone
LEFT JOIN
(
SELECT
*
FROM
(
SELECT
phone
,user_email
,ROW_NUMBER() OVER(PARTITION BY phone ORDER BY source_level ) AS RN
,source
,source_pd
FROM
ods.tmp_source_phone_unique
WHERE user_email IS NOT NULL AND phone IS NOT NULL
) E1
WHERE E1.RN=1
) E --郵件地址
ON A1.phone=E.phone
LEFT JOIN
(
SELECT
*
FROM
(
SELECT
phone
,user_qq
,ROW_NUMBER() OVER(PARTITION BY phone ORDER BY source_level ) AS RN
,source
,source_pd
FROM
ods.tmp_source_phone_unique
WHERE user_qq IS NOT NULL AND phone IS NOT NULL
) F1
WHERE F1.RN=1
) F --qq
ON A1.phone=F.phone
LEFT JOIN
(
SELECT
*
FROM
(
SELECT
phone
,user_wx
,ROW_NUMBER() OVER(PARTITION BY phone ORDER BY source_level ) AS RN
,source
,source_pd
FROM
ods.tmp_source_phone_unique
WHERE user_wx IS NOT NULL
) G1
WHERE G1.RN=1
) G --微信
ON A1.phone=G.phone
LEFT JOIN
(
SELECT
*
FROM
(
SELECT
phone
,user_addr
,ROW_NUMBER() OVER(PARTITION BY phone ORDER BY source_level ) AS RN
,source
,source_pd
FROM
ods.tmp_source_phone_unique
WHERE user_addr IS NOT NULL AND phone IS NOT NULL
) H1
WHERE H1.RN=1
) H --地址
ON A1.phone=H.phone
;
3、基于“身份證號”補齊每個人的基本資訊
- 目標表:ods.user_license_fuse表結構
CREATE TABLE IF NOT EXISTS ods.user_license_fuse(
phone_id string COMMENT "手機號識別成一個人"
,phone string COMMENT "手機號"
,phone_times INT COMMENT "手機號在全資料源中的頻次"
,user_name_first string COMMENT "同一手機號去表優先級最高的名字"
,user_name string COMMENT "同一證件/手機號名字頻次最高的名字"
,sex string COMMENT "性別"
,user_license string COMMENT "證件號"
,user_birth string COMMENT "出生日期"
,user_email string COMMENT "郵件地址"
,user_qq string COMMENT "QQ"
,user_wx string COMMENT "微信"
,user_addr string COMMENT "住址"
,source_list string COMMENT "資料源串列"
,source_pd_list string COMMENT "主鍵串列"
)
COMMENT 'ods層--基于證件號基本資訊融合表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\036'
STORED AS PARQUET
;
- 融合步驟:
- 基于身份證、user_name去統計全部資料源的user_name的頻次
CREATE TABLE ods.user_license_name1
AS
SELECT
user_license
,user_name
,count(user_name) AS user_name_times1
FROM
ods.tmp_source_phone_unique
WHERE user_license IS NOT NULL AND default.idennum(user_license) IS NOT NULL
GROUP BY user_license
,user_name
;
- 基于手機號、user_name去統計user_name的頻次 ,同一手機號取名字頻次高的名字
CREATE TABLE odsb.user_name_times2
AS
SELECT
phone
,user_name
,count(user_name) AS user_name_times2
FROM
odsb.tmp_source_phone_unique
WHERE user_name IS NOT NULL AND default.idennum(user_license) IS NULL
GROUP BY phone
,user_name
;
- 基于身份證號 或 phone_id 去融合個人資訊
INSERT OVERWRITE TABLE ods_g063_grt_all_db.user_license_fuse
SELECT
A.phone_id
,A.phone
,A.phone_times
,A.user_name_first
,A.user_name --同一證件號頻次高的名字
,CASE
WHEN LENGTH(A.user_license)=18
THEN IF(substr(A.user_license,17,1)%2 = 0,'女','男')--18位身份證提取性別
WHEN LENGTH(A.user_license)=15
THEN IF(substr(A.user_license,15,1)%2 = 0,'女','男') --15位身份證提取性別
ELSE A.sex
END AS sex
,A.user_license
,CASE
when length(A.user_license)=18
THEN SUBSTR(CAST(from_unixtime(UNIX_TIMESTAMP(substr(A.user_license,7, 8),'yyyyMMdd')) AS STRING),0,10)--18位身份證提取生日
when length(A.user_license)=15
THEN SUBSTR(CAST(from_unixtime(UNIX_TIMESTAMP(CONCAT('19',SUBSTR(A.user_license,7, 6)),'yyyyMMdd')) AS STRING),0,10) --15位身份證提取生日
ELSE A.user_birth
END AS user_birth
,IF(A.user_email IS NULL
,FIRST_VALUE(A.user_email) OVER (PARTITION BY A.user_license ORDER BY LENGTH(NVL(A.user_email ,'')) DESC)
,A.user_email
)AS user_email
,IF(A.user_qq IS NULL
,FIRST_VALUE(A.user_qq ) OVER (PARTITION BY A.user_license ORDER BY LENGTH(NVL(A.user_qq ,'')) DESC)
,A.user_qq
)AS user_qq
,IF(A.user_wx IS NULL
,FIRST_VALUE(A.user_wx) OVER (PARTITION BY A.user_license ORDER BY LENGTH(NVL(A.user_wx,'')) DESC)
,A.user_wx
)AS user_wx
,IF(A.user_addr IS NULL
,FIRST_VALUE(A.user_addr) OVER (PARTITION BY A.user_license ORDER BY LENGTH(NVL(A.user_addr,'')) DESC)
,A.user_addr
) AS user_addr
,A.source_list
,A.source_pd_list
FROM
(
SELECT
A.phone_id
,A.phone
,A.phone_times
,A.user_name_first
,B.user_name
,A.sex
,A.user_license
,A.user_birth
,A.user_email
,A.user_qq
,A.user_wx
,A.user_addr
,A.source_list
,A.source_pd_list
FROM
(
SELECT
*
FROM
ods_g063_grt_all_db.phone_fuse
WHERE default.idennum(user_license) IS NOT NULL
) A
LEFT JOIN
--同一證件號取頻次最高的名字
( SELECT
*
FROM
(
SELECT
user_license
,user_name
,row_number() over (partition by user_license order by user_name_times1 desc ) AS RN
FROM
ods_g063_grt_all_db.user_license_name1
) B1
WHERE RN=1
) B
ON A.user_license=B.user_license
) A --有證件,同一證件,user_name 取頻次最高的頻次、其他欄位取不為空的
UNION ALL
--【一人多機】資訊融合:一個人的多個手機號資訊進行融合
SELECT
B.phone_id
,B.phone
,B.phone_times
,B.user_name_first
,B.user_name --同一手機號取頻次高的名字
,IF(B.sex IS NULL OR B.sex='未知'
,FIRST_VALUE(B.sex) OVER(PARTITION BY B.phone_id ORDER BY IF(LENGTH(NVL(B.sex,''))>0,LENGTH(NVL(B.sex,'')),3) )
,B.sex
) AS sex
,B.user_license
,IF(B.user_birth IS NULL
,FIRST_VALUE(B.user_birth) OVER (PARTITION BY B.phone_id ORDER BY LENGTH(NVL(B.user_birth,'')) DESC)
,B.user_birth
) AS user_birth
,IF (B.user_email IS NULL
,FIRST_VALUE(B.user_email) OVER (PARTITION BY B.phone_id ORDER BY LENGTH(NVL(B.user_email,'')) DESC)
,B.user_email
) AS user_email
,IF(B.user_qq IS NULL
,FIRST_VALUE(B.user_qq ) OVER (PARTITION BY B.phone_id ORDER BY LENGTH(NVL(B.user_qq ,'')) DESC)
,B.user_qq
) AS user_qq
,IF (B.user_wx IS NULL
,FIRST_VALUE(B.user_wx ) OVER (PARTITION BY B.phone_id ORDER BY LENGTH(NVL(B.user_wx ,'')) DESC)
,B.user_wx
) AS user_wx
,IF (B.user_addr IS NULL
,FIRST_VALUE(B.user_addr) OVER (PARTITION BY B.phone_id ORDER BY LENGTH(NVL(B.user_addr ,'')) DESC)
,B.user_addr
) AS user_addr
,B.source_list
,B.source_pd_list
FROM
(
SELECT
A.phone_id
,A.phone
,A.phone_times
,A.user_name_first
,B.user_name
,A.sex
,A.user_license
,A.user_birth
,A.user_email
,A.user_qq
,A.user_wx
,A.user_addr
,A.source_list
,A.source_pd_list
FROM
(
SELECT
*
FROM
ods_g063_grt_all_db.phone_fuse
WHERE default.idennum(user_license) IS NULL
) A --這個表已經確保表手機號是唯一的了
LEFT JOIN
--同一手機號標識取 頻次最高的名字
( SELECT
*
FROM
(
SELECT
phone
,user_name
,row_number() over (partition by phone order by user_name_times2 desc ) AS RN
FROM
ods_g063_grt_all_db.user_name_times2
) B1
WHERE RN=1
) B
ON A.phone=B.phone
) B --無證件,同一手機號,user_name去頻次最高的那個
;
八、生成one_id
1、one_id初始化
- 目標表 表結構
CREATE TABLE IF NOT EXISTS dw.dim_one_id_initialize(
one_id string COMMENT "ONE_ID"
,user_license string COMMENT "證件號/phone_id"
,phone string COMMENT "手機號/座機號"
,phone_times int COMMENT "手機號頻次"
)
COMMENT 'dw層--one_id初始化表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\036'
STORED AS PARQUET
;
-
生成ond_id :最初one_id是基于手機號、座機號、身份證,進行hash()計算的
-
警惕:hash會碰撞(不同的取數,卻計算出來同樣的hash值)
-
碰撞概率:
- hash():2的-32次方
(計算結果約等1/42億的概率會重復) - md5():2的-256次方
(計算結果約等1/42億的8次方的概率會重復)
- hash():2的-32次方
-
為了獲得唯一one_id,我們采用了簡單的“凱撒加密”,“凱撒加密”java代碼如下:
package cn.ysw.com.aspect;
import cn.hutool.core.util.StrUtil;
import java.util.HashMap;
import java.util.Map;
public class Test {
private static Map<String,String> MAP = new HashMap<>();
static {
MAP.put("0","3");
MAP.put("1","4");
MAP.put("2","5");
MAP.put("3","6");
MAP.put("4","7");
MAP.put("5","8");
MAP.put("6","9");
MAP.put("7","0");
MAP.put("8","1");
MAP.put("9","2");
MAP.put("x","y");
MAP.put("X","Z");
}
public static String getId(String str) {
if (StrUtil.isBlank(str)) {
return "";
}
char[] chars = str.toCharArray();
StringBuffer sb = new StringBuffer();
for (char a: chars) {
String s = String.valueOf(a);
String val = MAP.get(s);
if (StrUtil.isBlank(val)) {
sb.append(a);
} else {
sb.append(val);
}
}
return sb.toString();
}
public static void main(String[] args) {
String id = getId();
System.out.println(id);
}
}
- 如果不想麻煩可以直接用MD5( ),畢竟計算結果重復的概率是 1/42億的8次方
INSERT OVERWRITE TABLE dw.dim_one_id_initialize
SELECT
md5(A.user_license) AS ONE_ID
,A.user_license
,A.phone
,A.phone_times
FROM
(
SELECT
user_license
,phone
,phone_times
FROM
ods.user_license_fuse
WHERE default.idennum(user_license) IS NOT NULL --有身份證號的資料
UNION ALL
SELECT
phone_id AS user_license
,phone
,phone_times
FROM
ods.user_license_fuse
WHERE default.idennum(user_license) IS NULL --無身份證號,有企業信用代碼 的資料
) A
;
2、one_id角色表
–ONE-ID 角色表
–1:線索
–2:潛客
–3:車主
- 目標表 表結構
CREATE TABLE IF NOT EXISTS dw.dim_one_id_role(
phone string COMMENT "手機號"
,role_id bigint COMMENT "角色編碼"
,role_name string COMMENT "角色名稱"
)
COMMENT 'dw層--one_id角色表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\036'
STORED AS PARQUET
;
-
這個角色判斷,是按手機號來源表去判斷的:

-
插入資料:one_id角色表
INSERT OVERWRITE TABLE dw.dim_one_id_role
SELECT
phone
,role_id
,role_name
FROM
(
SELECT
phone
,CASE
WHEN SOURCE IN ('易車網用戶表','懂車帝用戶表') THEN 1
WHEN SOURCE IN ('某4S店訪客登記表') THEN 2
WHEN SOURCE IN ('某4S店銷售訂單表','某汽車APP車主認證表') THEN 3
ELSE NULL
END AS role_id
,CASE
WHEN SOURCE IN ('易車網用戶表','懂車帝用戶表') THEN '線索'
WHEN SOURCE IN ('某4S店訪客登記表') THEN '潛客'
WHEN SOURCE IN ('某4S店銷售訂單表','某汽車APP車主認證表') THEN '車主'
ELSE NULL
END AS role_name
FROM
ods.tmp_source_phone_unique
) TT
GROUP BY phone
,role_id
,role_name
;
- 如果同一個手機號,同時擁有3個角色或2個角色,取“角色代碼”最大的
SELECT
*
FROM
(
SELECT
phone
,role_id
,ROW_NUMBER() OVER(PARTITION BY phone ORDER BY role_id desc) AS RN
,role_name AS one_id_role
FROM
dw.dim_one_id_role
) C1
WHERE C1.RN=1
3、one_id拉鏈表
- 表結構
CREATE TABLE IF NOT EXISTS dw.dim_one_id_all(
sk_id bigint COMMENT "自增主鍵"
,phone string COMMENT "該表唯一手機號"
,one_id string COMMENT "用戶唯一標識"
,nature string COMMENT "屬性:個人/組織"
,one_id_role string COMMENT "one-id角色:線索、潛客、車主"
,user_name_first string COMMENT "同一手機號取表優先級最高的名字"
,user_name string COMMENT "同一證件/手機號名字頻次最高的名字"
,sex string COMMENT "性別"
,user_license string COMMENT "證件號"
,user_birth string COMMENT "出生日期"
,user_email string COMMENT "郵件地址"
,user_qq string COMMENT "QQ"
,user_wx string COMMENT "微信"
,user_addr string COMMENT "住址"
,source_list string COMMENT "資料源串列"
,source_pd_list string COMMENT "主鍵串列"
,valid_start_dt string COMMENT "當前行生效日期"
,valid_end_dt string COMMENT "當前行失效日期"
)
COMMENT 'dw層--one_id拉鏈表'
PARTITIONED BY (PART_DAY STRING COMMENT '磁區欄位,當天系統時間')
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\036'
STORED AS PARQUET
;
- 拉鏈表更新
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;
SET hive.strict.checks.type.safety=false;
INSERT OVERWRITE TABLE dw.dim_one_id_all
PARTITION(PART_DAY)
SELECT
ROW_NUMBER() OVER (ORDER BY CONCAT_WS('_',t1.phone,cast(t1.one_id as string))) + max_sk AS sk_id
,t1.phone
,t1.one_id
,t1.nature
,t1.one_id_role
,t1.user_name_first
,t1.user_name
,t1.sex
,t1.user_license
,t1.user_birth
,t1.user_email
,t1.user_qq
,t1.user_wx
,t1.user_addr
,t1.source_list
,t1.source_pd_list
, DATE_FORMAT(CURRENT_DATE,'yyyyMMdd') AS valid_start_dt
, '99991231' AS valid_end_dt
,CAST(CURRENT_DATE AS STRING) AS PART_DAY
FROM
(
SELECT
B.ONE_ID
,A.phone
,CASE
WHEN default.checks(A.user_license) IS NOT NULL
AND A.user_birth IS NULL
THEN '組織'
WHEN SUBSTR(A.user_name_first,-1) IN ('司','處','局','館')
THEN '組織'
WHEN SUBSTR(user_name,-1) IN ('司','處','局','館')
THEN '組織'
ELSE '個人'
END AS nature --one-id屬性: 證件號經過 企業代碼驗證 不為null,且生日 為null(因為生日優先用證件號去判斷),則判斷 為"組織"
,C.one_id_role
,A.user_name_first
,A.user_name
,A.sex
,A.user_license
,A.user_birth
,A.user_email
,A.user_qq
,A.user_wx
,A.user_addr
,A.source_list
,A.source_pd_list
FROM
ods.user_license_fuse A
LEFT JOIN
dw.dim_one_id_initialize B --ONE_ID初始化表
ON A.phone = B.phone
LEFT JOIN
(
SELECT
*
FROM
(
SELECT
phone
,role_id
,ROW_NUMBER() OVER(PARTITION BY phone ORDER BY role_id desc) AS RN
,role_name AS one_id_role
FROM
dw.dim_one_id_role
) C1
WHERE C1.RN=1
) C --角色表
ON A.phone=C.phone
) t1 --源資料
LEFT JOIN
(
SELECT
*
FROM
dw.dim_one_id_all
WHERE valid_start_dt <= DATE_FORMAT(DATE_SUB(CURRENT_DATE,1),'yyyyMMdd') --只取有效的資料
AND valid_end_dt > DATE_FORMAT(DATE_SUB(CURRENT_DATE,1),'yyyyMMdd')
AND PART_DAY=CAST(DATE_SUB(CURRENT_DATE,1) AS STRING) --取昨天的磁區
)t2
ON t1.phone=t2.phone
--獲取維度表上次最大的外鍵,以便為新資料造新的外鍵
CROSS JOIN
(
SELECT
COALESCE(MAX(sk_id ),0) AS max_sk
FROM
dw.dim_one_id_all
)dim_sk_tmp
WHERE t2.sk_id is null
OR t1.one_id !=t2.one_id
OR t1.nature !=t2.nature
OR t1.one_id_role != t2.one_id_role
OR t1.user_name_first!=t2.user_name_first
OR t1.user_name !=t2.user_name
OR t1.sex !=t2.sex
OR t1.user_license !=t2.user_license
OR t1.user_birth !=t2.user_birth
OR t1.user_email !=t2.user_email
OR t1.user_qq !=t2.user_qq
OR t1.user_wx !=t2.user_wx
UNION ALL
SELECT
t3.sk_id
,t3.phone
,t3.one_id
,t3.nature
,t3.one_id_role
,t3.user_name_first
,t3.user_name
,t3.sex
,t3.user_license
,t3.user_birth
,t3.user_email
,t3.user_qq
,t3.user_wx
,t3.user_addr
,t3.source_list
,t3.source_pd_list
,t3.valid_start_dt
,CASE
WHEN t3.one_id !=t4.one_id
OR t3.nature !=t4.nature
OR t3.one_id_role !=t4.one_id_role
OR t3.user_name_first!=t4.user_name_first
OR t3.user_name !=t4.user_name
OR t3.sex !=t4.sex
OR t3.user_license !=t4.user_license
OR t3.user_birth !=t4.user_birth
OR t3.user_email !=t4.user_email
OR t3.user_qq !=t4.user_qq
OR t3.user_wx !=t4.user_wx
THEN DATE_FORMAT(DATE_SUB(CURRENT_DATE,1),'yyyyMMdd')
ELSE t3.valid_end_dt
END AS valid_end_dt
,CAST(CURRENT_DATE AS STRING) AS PART_DAY
FROM
(
SELECT
*
FROM
dw.dim_one_id_all
WHERE valid_start_dt <= DATE_FORMAT(DATE_SUB(CURRENT_DATE,1),'yyyyMMdd') --只取有效的資料
AND valid_end_dt > DATE_FORMAT(DATE_SUB(CURRENT_DATE,1),'yyyyMMdd')
AND PART_DAY=CAST(DATE_SUB(CURRENT_DATE,1) AS STRING) --取昨天的磁區
)t3 --維度表
LEFT JOIN
(
SELECT
B.ONE_ID
,A.phone
,CASE
WHEN default.checks(A.user_license) IS NOT NULL
AND A.user_birth IS NULL
THEN '組織'
WHEN SUBSTR(A.user_name_first,-1) IN ('司','處','局','館')
THEN '組織'
WHEN SUBSTR(user_name,-1) IN ('司','處','局','館')
THEN '組織'
ELSE '個人'
END AS nature --one-id屬性: 證件號經過 企業代碼驗證 不為null,且生日 為null(因為生日優先用證件號去判斷),則判斷 為"組織"
,C.one_id_role
,A.user_name_first
,A.user_name
,A.sex
,A.user_license
,A.user_birth
,A.user_email
,A.user_qq
,A.user_wx
,A.user_addr
,A.source_list
,A.source_pd_list
FROM
ods.user_license_fuse A
LEFT JOIN
dw.dim_one_id_initialize B --ONE_ID初始化表
ON A.phone = B.phone
LEFT JOIN
(
SELECT
*
FROM
(
SELECT
phone
,role_id
,ROW_NUMBER() OVER(PARTITION BY phone ORDER BY role_id desc) AS RN
,role_name AS one_id_role
FROM
dw.dim_one_id_role
) C1
WHERE C1.RN=1 --同個手機號有多個角色,取序列號最大的角色
) C --角色表
ON A.phone=C.phone
) t4 --源資料
ON t3.phone=t4.phone
UNION ALL
--
SELECT
t4.sk_id
,t4.phone
,t4.one_id
,t4.nature
,t4.one_id_role
,t4.user_name_first
,t4.user_name
,t4.sex
,t4.user_license
,t4.user_birth
,t4.user_email
,t4.user_qq
,t4.user_wx
,t4.user_addr
,t4.source_list
,t4.source_pd_list
,t4.valid_start_dt
,t4.valid_end_dt
,CAST(CURRENT_DATE AS STRING) AS PART_DAY
FROM
dw.dim_one_id_all t4
WHERE valid_end_dt <= DATE_FORMAT(DATE_SUB(CURRENT_DATE,1),'yyyyMMdd')
AND PART_DAY=CAST(DATE_SUB(CURRENT_DATE,1) AS STRING) --取昨天的磁區
;
4、one_id應用表
- 表結構
CREATE TABLE IF NOT EXISTS dw.dim_one_id(
sk_id bigint COMMENT "自增主鍵"
,phone string COMMENT "該表唯一手機號"
,one_id string COMMENT "用戶唯一標識"
,nature string COMMENT "屬性:個人/組織"
,one_id_role string COMMENT "one-id角色:線索、潛客、車主"
,user_name_first string COMMENT "同一手機號取表優先級最高的名字"
,user_name string COMMENT "同一證件/手機號名字頻次最高的名字"
,sex string COMMENT "性別"
,user_license string COMMENT "證件號"
,user_birth string COMMENT "出生日期"
,user_email string COMMENT "郵件地址"
,user_qq string COMMENT "QQ"
,user_wx string COMMENT "微信"
,user_addr string COMMENT "住址"
,source_list string COMMENT "資料源串列"
,source_pd_list string COMMENT "主鍵串列"
)
COMMENT 'dw層--one_id維度表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\036'
STORED AS PARQUET
;
- 插入資料:基于one_id拉鏈表,取最新且有效的one_id資料
INSERT OVERWRITE TABLE dw.dim_one_id
SELECT
sk_id
,phone
,one_id
,nature
,one_id_role
,user_name_first
,user_name
,sex
,user_license
,user_birth
,user_email
,user_qq
,user_wx
,user_addr
,source_list
,source_pd_list
FROM
dw.dim_one_id_all
WHERE valid_start_dt <= DATE_FORMAT(CURRENT_DATE,'yyyyMMdd') --只取有效的資料
AND valid_end_dt > DATE_FORMAT(CURRENT_DATE,'yyyyMMdd')
AND PART_DAY=CAST(CURRENT_DATE AS STRING)
;
轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/278181.html
標籤:其他
上一篇:Linux——(1)基本命令
