TensorFlow 2.0 + transformers: reproducing the Bert-Last_3embedding_concat best single model from the third-place team in the EWECT Weibo emotion classification competition
Preface
I have recently been building an emotion classifier for NetEase Cloud Music comments as part of a study of its comment community. While searching for approaches used in related competitions, I came across one of the few cases where a single model already achieves strong results, so I reproduced it as the first version of my model.
Reproduced model: the best single emotion-classification model on the usual corpus from 拿第一導師請吃肯德基, the third-place team in the Weibo Emotion Classification Evaluation (SMP2020-EWECT).
Model structure: the [CLS] hidden vectors of BERT's last three transformer layers are concatenated with the pooled output, then passed through dropout, a 512-unit ReLU dense layer, and a softmax over the six emotion classes.
Code
tensorflow: 2.0
transformers: 3.1.0
Without further ado, here is the code:
```python
import os

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from transformers import (BertConfig, BertTokenizer,
                          TFBertMainLayer, TFBertPreTrainedModel)


class BertNerModel(TFBertPreTrainedModel):
    dense_layer = 512      # size of the hidden dense layer
    class_num = 6          # six emotion classes in the usual corpus
    drop_out_rate = 0.5

    def __init__(self, config, *inputs, **kwargs):
        super(BertNerModel, self).__init__(config, *inputs, **kwargs)
        config.output_hidden_states = True  # expose every layer's hidden states
        self.bert_layer = TFBertMainLayer(config, name='bert')
        self.bert_layer.trainable = True
        self.liner_layer = tf.keras.layers.Dense(self.dense_layer, activation='relu')
        self.soft_max = tf.keras.layers.Dense(self.class_num, activation='softmax')
        self.drop_out = tf.keras.layers.Dropout(self.drop_out_rate)

    def call(self, inputs):
        # With output_hidden_states=True the outputs are
        # (sequence_output, pooled_output, all_hidden_states).
        hidden_states = self.bert_layer(inputs)
        # Concatenate the [CLS] vectors of the last three layers with the pooled output.
        tensor = tf.concat((hidden_states[2][-1][:, 0],
                            hidden_states[2][-2][:, 0],
                            hidden_states[2][-3][:, 0],
                            hidden_states[1]), 1)
        drop_out_l = self.drop_out(tensor)
        dense_l = self.liner_layer(drop_out_l)
        outputs = self.soft_max(dense_l)
        return outputs


def encode_(x, y, tokenizer):
    train_texts, val_texts, train_tags, val_tags = train_test_split(
        x, y, test_size=0.2, random_state=888)
    batch_x1 = tokenizer(train_texts, padding=True, truncation=True,
                         return_tensors="tf", max_length=60)
    batch_x2 = tokenizer(val_texts, padding=True, truncation=True,
                         return_tensors="tf", max_length=60)
    label_1 = tf.constant(train_tags)
    label_2 = tf.constant(val_tags)
    dataset_train = tf.data.Dataset.from_tensor_slices((dict(batch_x1), label_1))
    dataset_test = tf.data.Dataset.from_tensor_slices((dict(batch_x2), label_2))
    return dataset_train, dataset_test


class Metrics(tf.keras.callbacks.Callback):
    '''Computes macro F1 on the validation set after every epoch and saves the
    model whenever the F1 improves. (Optional.)'''

    def __init__(self, valid_data):
        super(Metrics, self).__init__()
        self.validation_data = valid_data  # (inputs, labels) tuple

    def on_train_begin(self, logs=None):
        self.val_f1s = []
        self.best_val_f1 = 0

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        val_predict = np.argmax(self.model(self.validation_data[0]), -1)
        val_targ = self.validation_data[1]
        _val_f1 = f1_score(val_targ, val_predict, average='macro')
        self.val_f1s.append(_val_f1)
        logs['val_f1'] = _val_f1
        if _val_f1 > self.best_val_f1:
            self.model.save_pretrained(
                './checkpoints/weights-f1={}/'.format(_val_f1), overwrite=True)
            self.best_val_f1 = _val_f1
            print("best f1: {}".format(self.best_val_f1))
        else:
            print("val f1: {}, but not the best f1".format(_val_f1))


def focal_loss(label, pred, class_num=6, gamma=2):
    '''Multi-class focal loss; it did not improve results here. (Optional.)'''
    label = tf.squeeze(tf.cast(tf.one_hot(tf.cast(label, tf.int32), class_num),
                               pred.dtype))
    pred = tf.clip_by_value(pred, 1e-8, 1.0)
    w1 = tf.math.pow((1.0 - pred), gamma)
    return -tf.math.reduce_sum(w1 * label * tf.math.log(pred))


def sparse_categorical_crossentropy(y_true, y_pred):
    y_true = tf.reshape(y_true, tf.shape(y_pred)[:-1])
    y_true = tf.cast(y_true, tf.int32)
    y_true = tf.one_hot(y_true, K.shape(y_pred)[-1])
    return tf.keras.losses.categorical_crossentropy(y_true, y_pred)


def loss_with_gradient_penalty(model, epsilon=1):
    '''FGM-style adversarial gradient-penalty loss; worth roughly one point of
    improvement. (Optional.) Note that tf.gradients only works in graph mode,
    which is the default inside model.fit.'''
    def loss_with_gradient_penalty_2(y_true, y_pred):
        loss = tf.math.reduce_mean(sparse_categorical_crossentropy(y_true, y_pred))
        embeddings = model.variables[0]  # word-embedding matrix
        gp = tf.math.reduce_sum(tf.gradients(loss, [embeddings])[0].values ** 2)
        return loss + 0.5 * epsilon * gp
    return loss_with_gradient_penalty_2


def main():
    if not os.path.exists('./checkpoints'):
        os.makedirs('./checkpoints')
    tb_callback = tf.keras.callbacks.TensorBoard(log_dir='./logs', profile_batch=0)

    # Load the pretrained BERT (converted from a PyTorch checkpoint).
    pretrained_path = 'model/'
    config_path = os.path.join(pretrained_path, 'bert_config.json')
    vocab_path = os.path.join(pretrained_path, 'vocab.txt')
    config = BertConfig.from_json_file(config_path)
    tokenizer = BertTokenizer.from_pretrained(vocab_path)
    bert_ner_model = BertNerModel.from_pretrained(pretrained_path, config=config,
                                                  from_pt=True)

    # Load the data and map the emotion labels to integer ids.
    data = pd.read_csv('data_proceed.csv')
    data = data.dropna()
    emotion_g2id = {}
    for i, j in enumerate(set(data['情緒標簽'])):
        emotion_g2id[j] = i
    data['情緒標簽'] = data['情緒標簽'].apply(lambda x: emotion_g2id[x])

    data_size = len(data['情緒標簽'])
    train_size = int(data_size * 0.8)
    val_size = int(data_size * 0.2)
    steps_per_epoch = train_size // 16
    validation_step = val_size // 16

    dataset_train, dataset_test = encode_(list(data['文本']),
                                          list(data['情緒標簽']), tokenizer)
    dataset_train = dataset_train.shuffle(999).repeat().batch(16)
    dataset_test = dataset_test.batch(16)

    optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
    bert_ner_model.compile(optimizer=optimizer,
                           loss=[loss_with_gradient_penalty(bert_ner_model, 0.5)],
                           metrics=['sparse_categorical_accuracy'])
    bert_ner_model.fit(dataset_train, epochs=5, verbose=1,
                       steps_per_epoch=steps_per_epoch,
                       validation_data=dataset_test,
                       validation_steps=validation_step)
    bert_ner_model.save_pretrained('./my_mrpc_model/')


if __name__ == '__main__':
    main()
```
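One thing to watch: tb_callback and the Metrics callback are created but never passed to fit, so neither the TensorBoard logs nor the best-F1 checkpointing actually runs. If you want them, wire them in along these lines (val_inputs and val_labels are hypothetical names for the raw validation tensors, which encode_ would need to return as well, since Metrics expects an (inputs, labels) pair rather than a tf.data.Dataset):

```python
f1_callback = Metrics((val_inputs, val_labels))  # hypothetical raw validation tensors
bert_ner_model.fit(dataset_train,
                   epochs=5,
                   steps_per_epoch=steps_per_epoch,
                   validation_data=dataset_test,
                   validation_steps=validation_step,
                   callbacks=[tb_callback, f1_callback])
```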
Training results
Summary
The results still fall somewhat short of the competition numbers, because I did not spend much effort on data preprocessing or model tuning: here the three hidden layers are simply followed by one dense layer and then softmax. It is worth trying to add an LSTM or similar on top; a sketch of one such variant follows.
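For illustration, a minimal sketch of one such variant (my own addition, not part of the original solution): instead of concatenating [CLS] vectors, run the last layer's full sequence output through a BiLSTM and max-pool it before the classifier.

```python
class BertBiLstmModel(TFBertPreTrainedModel):
    '''Hypothetical variant: BiLSTM over the last layer's sequence output.'''
    class_num = 6

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.bert_layer = TFBertMainLayer(config, name='bert')
        self.bi_lstm = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(128, return_sequences=True))
        self.pool = tf.keras.layers.GlobalMaxPooling1D()
        self.drop_out = tf.keras.layers.Dropout(0.5)
        self.soft_max = tf.keras.layers.Dense(self.class_num, activation='softmax')

    def call(self, inputs):
        sequence_output = self.bert_layer(inputs)[0]  # (batch, seq_len, hidden)
        x = self.bi_lstm(sequence_output)
        x = self.pool(x)
        x = self.drop_out(x)
        return self.soft_max(x)
```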
Iterative improvement
Next I plan to use this model to run inference on NetEase Cloud Music comments, take the high-confidence samples, and add them to the original training set for another round of training, which has a bit of a self-training flavour; a rough sketch of that loop is shown below.
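A rough sketch of that pseudo-labelling step, assuming a trained model, the tokenizer, and a list of unlabelled comments (the 0.95 threshold is an arbitrary choice of mine):

```python
def pseudo_label(model, tokenizer, unlabeled_texts, threshold=0.95):
    '''Keep only predictions the model is confident about (illustrative sketch).'''
    batch = tokenizer(unlabeled_texts, padding=True, truncation=True,
                      return_tensors="tf", max_length=60)
    probs = model(dict(batch)).numpy()  # (n_samples, class_num) softmax outputs
    preds = probs.argmax(-1)
    conf = probs.max(-1)
    keep = conf >= threshold
    new_texts = [t for t, k in zip(unlabeled_texts, keep) if k]
    new_labels = preds[keep].tolist()
    return new_texts, new_labels

# The high-confidence samples would then be appended to the original
# training data and the model retrained on the enlarged set.
```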