【NLP】學不會打我! 半小時學會基本操作 14: 一百行實作 Bert 二分類
- 概述
- Bert
- 簡單說一說
- 100 行實作 Bert
- 網路架構
- 超引數
- get_data
- main
- 完整代碼
概述
從今天開始我們將開啟一段自然語言處理 (NLP) 的旅程. 自然語言處理可以讓機器來處理, 理解, 以及運用人類的語言, 成為機器語言和人類語言之間的溝通橋梁.

Bert
Bert (Bidirectional Encoder Representations from Transformers) 是一個預訓練的語言表徵模型. Bert 主要利用了 Transformer 的 Encoder 結構, 這裡就不多贅述.

簡單說一說
在大家的鼓勵下這屆的 CCF 榮獲 4 個冠軍, 1 個亞軍, 在天池中榮獲第 4 名. 如圖:





100 行實作 Bert
以下代碼是全網最簡單的 Bert 實作, 部分為比賽原始碼.
網路架構
Model: "model"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_1 (InputLayer) [(None, 780)] 0
__________________________________________________________________________________________________
input_2 (InputLayer) [(None, 780)] 0
__________________________________________________________________________________________________
tf_bert_model (TFBertModel) TFBaseModelOutputWit 102267648 input_1[0][0]
input_2[0][0]
__________________________________________________________________________________________________
dense (Dense) (None, 2) 7690 tf_bert_model[0][1]
==================================================================================================
Total params: 102,275,338
Trainable params: 102,275,338
Non-trainable params: 0
__________________________________________________________________________________________________
超引數
# Hyperparameters
EPOCHS = 50  # number of training epochs
BATCH_SIZE = 8  # samples per training step
learning_rate = 3e-5  # learning rate handed to Adam below
INPUT_DIM = 36782 + 1  # not referenced by the code shown in this article
# NOTE(review): BERT-base checkpoints commonly ship 512 position embeddings --
# confirm that this checkpoint really supports 780-token sequences.
MAX_LENGTH = 780  # sequence length in tokens (tokenizer limit and model input width)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)  # optimizer
loss = tf.keras.losses.CategoricalCrossentropy()  # loss for one-hot style labels
bert_tokenizer = BertTokenizer.from_pretrained("Langboat/mengzi-bert-base")  # BERT tokenizer
get_data
def _labels_to_array(frame):
    """Return the ``label`` column of *frame* as a 2-D float32 array.

    Each cell must already be a list of numeric strings (one entry per
    class), as produced by splitting the raw CSV value on commas.
    """
    return np.asarray(frame["label"].values.tolist(), dtype=np.float32)


def _encode(texts):
    """Tokenize *texts* into fixed-width ``(input_ids, attention_mask)`` arrays."""
    # padding="max_length" (rather than padding=True, which only pads to the
    # longest sample of the current call) guarantees that every split is
    # exactly MAX_LENGTH wide, matching the fixed-size Keras inputs in main().
    encoded = bert_tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )
    return np.asarray(encoded["input_ids"]), np.asarray(encoded["attention_mask"])


def get_data():
    """Load the train/validation CSV files and tokenize them for BERT.

    Reads ``../data/train.csv`` and ``../data/val.csv``. Each file needs a
    ``text`` column and a ``label`` column holding comma-separated values
    (for example ``"0,1"``).

    :return: ``(train_input, val_input, train_mask, val_mask, y_train, y_val)``
        as NumPy arrays. Inputs and masks have shape
        ``(n_samples, MAX_LENGTH)``; labels are float32 with one column per
        comma-separated value. The arrays are NOT batched.
    """
    # Load the raw splits.
    data_train = pd.read_csv("../data/train.csv")
    print(data_train.head(), data_train.shape)
    data_val = pd.read_csv("../data/val.csv")
    print(data_val.head(), data_val.shape)

    # Preprocess: "0,1" -> ["0", "1"]
    for frame in (data_train, data_val):
        frame["label"] = frame["label"].apply(lambda x: x.split(","))
        print(frame.head())

    # Labels
    y_train = _labels_to_array(data_train)
    y_val = _labels_to_array(data_val)

    # Token ids and attention masks
    train_input, train_mask = _encode(data_train["text"].values.tolist())
    val_input, val_mask = _encode(data_val["text"].values.tolist())
    print("=" * 20, "字個數:", bert_tokenizer.vocab_size, "=" * 20)

    return train_input, val_input, train_mask, val_mask, y_train, y_val
main
def main():
    """Fine-tune ``Langboat/mengzi-bert-base`` as a two-class classifier.

    Loads the tokenized data through ``get_data()``, puts a 2-unit softmax
    layer on top of the BERT encoder output, and trains with the module-level
    ``optimizer`` and ``loss``. Only the weights with the lowest validation
    loss are saved, to ``../model/bert_mengzi/bert_mengzi.ckpt``.
    """
    # Load data; get_data() returns the validation split next to each
    # training array.
    X_train_input, X_val_input, X_train_mask, X_val_mask, y_train, y_val = get_data()

    # Debug output: first rows and shape of every array.
    for array in (X_train_input, X_val_input, X_train_mask, X_val_mask, y_train, y_val):
        print(array[:5], array.shape)

    # BERT encoder (from_pt=True loads the weights from a PyTorch checkpoint).
    bert = TFBertModel.from_pretrained("Langboat/mengzi-bert-base", from_pt=True)
    input_ids = tf.keras.Input(shape=(MAX_LENGTH,), dtype=tf.int32)
    masks = tf.keras.Input(shape=(MAX_LENGTH,), dtype=tf.int32)
    # Element 1 of the encoder output is the pooled sentence representation.
    pooled = bert([input_ids, masks])[1]
    classifier = Dense(2, activation="softmax")(pooled)

    # Build the model with the public tf.keras.Model: the inputs and layers
    # above come from tf.keras, and combining them with the private
    # tensorflow.python.keras.Model can fail on TensorFlow releases where the
    # two are separate implementations.
    model = tf.keras.Model(inputs=[input_ids, masks], outputs=classifier)
    # summary() prints the table itself and returns None, so it is not
    # wrapped in print().
    model.summary()

    # Compile
    model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

    # Keep only the best weights, judged by validation loss.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        "../model/bert_mengzi/bert_mengzi.ckpt",
        monitor="val_loss",
        verbose=1,
        save_best_only=True,
        mode="min",
        save_weights_only=True,
    )

    # Train
    model.fit(
        [X_train_input, X_train_mask],
        y_train,
        validation_data=([X_val_input, X_val_mask], y_val),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=[checkpoint],
    )
完整代碼
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.python.keras import Model
from tensorflow.keras.layers import Dense
from transformers import BertTokenizer, TFBertModel
# Hyperparameters
EPOCHS = 50  # number of training epochs
BATCH_SIZE = 8  # samples per training step
learning_rate = 3e-5  # learning rate handed to Adam below
INPUT_DIM = 36782 + 1  # not referenced by the code shown in this article
# NOTE(review): BERT-base checkpoints commonly ship 512 position embeddings --
# confirm that this checkpoint really supports 780-token sequences.
MAX_LENGTH = 780  # sequence length in tokens (tokenizer limit and model input width)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)  # optimizer
loss = tf.keras.losses.CategoricalCrossentropy()  # loss for one-hot style labels
bert_tokenizer = BertTokenizer.from_pretrained("Langboat/mengzi-bert-base")  # BERT tokenizer
def _labels_to_array(frame):
    """Return the ``label`` column of *frame* as a 2-D float32 array.

    Each cell must already be a list of numeric strings (one entry per
    class), as produced by splitting the raw CSV value on commas.
    """
    return np.asarray(frame["label"].values.tolist(), dtype=np.float32)


def _encode(texts):
    """Tokenize *texts* into fixed-width ``(input_ids, attention_mask)`` arrays."""
    # padding="max_length" (rather than padding=True, which only pads to the
    # longest sample of the current call) guarantees that every split is
    # exactly MAX_LENGTH wide, matching the fixed-size Keras inputs in main().
    encoded = bert_tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )
    return np.asarray(encoded["input_ids"]), np.asarray(encoded["attention_mask"])


def get_data():
    """Load the train/validation CSV files and tokenize them for BERT.

    Reads ``../data/train.csv`` and ``../data/val.csv``. Each file needs a
    ``text`` column and a ``label`` column holding comma-separated values
    (for example ``"0,1"``).

    :return: ``(train_input, val_input, train_mask, val_mask, y_train, y_val)``
        as NumPy arrays. Inputs and masks have shape
        ``(n_samples, MAX_LENGTH)``; labels are float32 with one column per
        comma-separated value. The arrays are NOT batched.
    """
    # Load the raw splits.
    data_train = pd.read_csv("../data/train.csv")
    print(data_train.head(), data_train.shape)
    data_val = pd.read_csv("../data/val.csv")
    print(data_val.head(), data_val.shape)

    # Preprocess: "0,1" -> ["0", "1"]
    for frame in (data_train, data_val):
        frame["label"] = frame["label"].apply(lambda x: x.split(","))
        print(frame.head())

    # Labels
    y_train = _labels_to_array(data_train)
    y_val = _labels_to_array(data_val)

    # Token ids and attention masks
    train_input, train_mask = _encode(data_train["text"].values.tolist())
    val_input, val_mask = _encode(data_val["text"].values.tolist())
    print("=" * 20, "字個數:", bert_tokenizer.vocab_size, "=" * 20)

    return train_input, val_input, train_mask, val_mask, y_train, y_val
def main():
    """Fine-tune ``Langboat/mengzi-bert-base`` as a two-class classifier.

    Loads the tokenized data through ``get_data()``, puts a 2-unit softmax
    layer on top of the BERT encoder output, and trains with the module-level
    ``optimizer`` and ``loss``. Only the weights with the lowest validation
    loss are saved, to ``../model/bert_mengzi/bert_mengzi.ckpt``.
    """
    # Load data; get_data() returns the validation split next to each
    # training array.
    X_train_input, X_val_input, X_train_mask, X_val_mask, y_train, y_val = get_data()

    # Debug output: first rows and shape of every array.
    for array in (X_train_input, X_val_input, X_train_mask, X_val_mask, y_train, y_val):
        print(array[:5], array.shape)

    # BERT encoder (from_pt=True loads the weights from a PyTorch checkpoint).
    bert = TFBertModel.from_pretrained("Langboat/mengzi-bert-base", from_pt=True)
    input_ids = tf.keras.Input(shape=(MAX_LENGTH,), dtype=tf.int32)
    masks = tf.keras.Input(shape=(MAX_LENGTH,), dtype=tf.int32)
    # Element 1 of the encoder output is the pooled sentence representation.
    pooled = bert([input_ids, masks])[1]
    classifier = Dense(2, activation="softmax")(pooled)

    # Build the model with the public tf.keras.Model: the inputs and layers
    # above come from tf.keras, and combining them with the private
    # tensorflow.python.keras.Model can fail on TensorFlow releases where the
    # two are separate implementations.
    model = tf.keras.Model(inputs=[input_ids, masks], outputs=classifier)
    # summary() prints the table itself and returns None, so it is not
    # wrapped in print().
    model.summary()

    # Compile
    model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

    # Keep only the best weights, judged by validation loss.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        "../model/bert_mengzi/bert_mengzi.ckpt",
        monitor="val_loss",
        verbose=1,
        save_best_only=True,
        mode="min",
        save_weights_only=True,
    )

    # Train
    model.fit(
        [X_train_input, X_train_mask],
        y_train,
        validation_data=([X_val_input, X_val_mask], y_val),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=[checkpoint],
    )


if __name__ == "__main__":
    main()
轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/381905.html
標籤:AI
上一篇:改進的PID演算法
