一、安裝datax
通過https://datax-opensource.oss-cn-hangzhou.aliyuncs.com/202210/datax.tar.gz這個地址下載datax.tar.gz,解壓到某個目錄,如我的目錄/opt/conf/datax-20230301,解壓完后會在當前目錄下生成datax目錄,進入datax目錄后的目錄如下圖所示:

之后在datax安裝目錄下,運行以下命令,賦予執行權限,
sudo chmod -R 755 ./*
二、測驗datax是否正確安裝
/opt/conf/datax-20230301/datax/bin/datax.py /opt/conf/datax-20230301/datax/job/job.json
運行以上命令,看是否能正確啟動,啟動后運行完結果如下圖:

如果那個正確運行,說明/opt/conf/datax-20230301/datax/bin/datax.py這個檔案的編碼不是utf-8,需要重新編碼,用我這個替換一下即可正常使用,
datax.py
三、撰寫組態檔
在datax安裝目錄下的job檔案夾,使用以下命令新建組態檔
vim job_air_data_source_mysql_hdfs.json
之后將下面的json檔案內容拷貝粘貼到剛才打開的檔案,保存即可,
{
"job": {
"setting": {
"speed": {
"channel": 3
},
"errorLimit": {
"record": 0,
"percentage": 0.02
}
},
"content": [
{
"reader": {
"name": "mysqlreader",
"parameter": {
"username": "root",
"password": "root",
"column": ["*"],
"splitPk": "id",
"connection": [
{
"table": [
"air_data_source"
],
"jdbcUrl": [
"jdbc:mysql://master:3306/air_data"
]
}
]
}
},
"writer": {
"name": "hdfswriter",
"parameter": {
"defaultFS": "hdfs://master:9820",
"fileType": "TEXT",
"path": "/user/hive/warehouse/air_data.db/air_data_source",
"fileName": "air_data_source_202302",
"column": [
{"name": " id ","type": "STRING"},
{"name": " airlinelogo ","type": "STRING"},
{"name": " airlineshortcompany ","type": "STRING"},
{"name": " arractcross ","type": "STRING"},
{"name": " arracttime ","type": "STRING"},
{"name": " arrairport ","type": "STRING"},
{"name": " arrcode ","type": "STRING"},
{"name": " arrontimerate ","type": "STRING"},
{"name": " arrplancross ","type": "STRING"},
{"name": " arrplantime ","type": "STRING"},
{"name": " arrterminal ","type": "STRING"},
{"name": " checkintable ","type": "STRING"},
{"name": " checkintablewidth ","type": "STRING"},
{"name": " depactcross ","type": "STRING"},
{"name": " depacttime ","type": "STRING"},
{"name": " depairport ","type": "STRING"},
{"name": " depcode ","type": "STRING"},
{"name": " depplancross ","type": "STRING"},
{"name": " depplantime ","type": "STRING"},
{"name": " depterminal ","type": "STRING"},
{"name": " flightno ","type": "STRING"},
{"name": " flightstate ","type": "STRING"},
{"name": " localdate ","type": "STRING"},
{"name": " mainflightno ","type": "STRING"},
{"name": " shareflag ","type": "STRING"},
{"name": " statecolor ","type": "STRING"}
],
"writeMode": "truncate",
"fieldDelimiter": "\u0001",
"compress":"GZIP"
}
}
}
]
}
}
四、Hive建資料庫、資料表
create database air_data;
use air_data;
CREATE TABLE `air_data_source`(
`id` int COMMENT '主鍵',
`airlinelogo` string COMMENT '航空公司logo',
`airlineshortcompany` string COMMENT '航空公司簡稱',
`arractcross` string,
`arracttime` string COMMENT '實際起飛時間',
`arrairport` string,
`arrcode` string,
`arrontimerate` string COMMENT '到達準點率',
`arrplancross` string,
`arrplantime` string COMMENT '計劃到達時間',
`arrterminal` string,
`checkintable` string,
`checkintablewidth` string,
`depactcross` string,
`depacttime` string COMMENT '實際到達時間',
`depairport` string COMMENT '到達機場名稱',
`depcode` string COMMENT '到達機場代碼',
`depplancross` string,
`depplantime` string COMMENT '計劃起飛時間',
`depterminal` string,
`flightno` string COMMENT '航班號',
`flightstate` string COMMENT '航班狀態',
`localdate` string,
`mainflightno` string,
`shareflag` string,
`statecolor` string)
COMMENT '航空資料原始表'
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
;
運行完以上任務后,接著可以進行資料抽取了,
四、運行任務
在當前目錄下執行以下命令:
/opt/conf/datax-20230301/datax/bin/datax.py /opt/conf/datax-20230301/datax/job/job_air_data_source_mysql_hdfs.json
即可正確啟動資料同步任務,運行完結果如下:

查看HDFS上是否已經有了資料檔案,運行一下命令,得到輸出,
hadoop fs -ls hdfs://master:9820/user/hive/warehouse/air_data.db/air_data_source

至此,利用datax將mysql資料同步到hdfs任務已配置完成,
轉載請註明出處,本文鏈接:https://www.uj5u.com/shujuku/545991.html
標籤:大數據
上一篇:資料問題排查思路
下一篇:資料漂移問題及解決方案
