Tweet Sentiment Extraction (TensorFlow)

Introduction

In this article, we show how to tokenize the data, create question-answer targets, and build a custom question-answering head for roBERTa in TensorFlow.

Load Packages, Data, and Tokenizer

import pandas as pd, numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.model_selection import StratifiedKFold
from transformers import RobertaConfig, TFRobertaModel
import tokenizers

MAX_LEN = 96
PATH = '../input/tf-roberta/'
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file=PATH+'vocab-roberta-base.json',
    merges_file=PATH+'merges-roberta-base.txt',
    lowercase=True,
    add_prefix_space=True
)
sentiment_id = {'positive': 1313, 'negative': 2430, 'neutral': 7974}
train = pd.read_csv('../input/tweet-sentiment-extraction/train.csv').fillna('')
train.head()

The output is the first five rows of the training data (columns textID, text, selected_text, sentiment).
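
As an aside, the constants in sentiment_id are not arbitrary: they should be the byte-level BPE token ids that this tokenizer assigns to each sentiment word (with the leading space implied by add_prefix_space=True). A quick check, my addition rather than part of the original notebook:

# Verify where the sentiment_id constants come from (sanity check only).
for word in ['positive', 'negative', 'neutral']:
    print(word, tokenizer.encode(word).ids)
# Expected: single-element lists [1313], [2430], [7974]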

Training Data

We now convert the training data into arrays that roBERTa understands. The code below builds the inputs and the start/end targets:

ct = train.shape[0]
input_ids = np.ones((ct,MAX_LEN),dtype='int32')
attention_mask = np.zeros((ct,MAX_LEN),dtype='int32')
token_type_ids = np.zeros((ct,MAX_LEN),dtype='int32')
start_tokens = np.zeros((ct,MAX_LEN),dtype='int32')
end_tokens = np.zeros((ct,MAX_LEN),dtype='int32')

for k in range(train.shape[0]):

    # FIND OVERLAP
    text1 = " "+" ".join(train.loc[k,'text'].split())
    text2 = " ".join(train.loc[k,'selected_text'].split())
    idx = text1.find(text2)
    chars = np.zeros((len(text1)))
    chars[idx:idx+len(text2)]=1
    if text1[idx-1]==' ': chars[idx-1] = 1
    enc = tokenizer.encode(text1)

    # ID_OFFSETS
    offsets = []; idx=0
    for t in enc.ids:
        w = tokenizer.decode([t])
        offsets.append((idx,idx+len(w)))
        idx += len(w)

    # START END TOKENS
    toks = []
    for i,(a,b) in enumerate(offsets):
        sm = np.sum(chars[a:b])
        if sm>0: toks.append(i)

    s_tok = sentiment_id[train.loc[k,'sentiment']]
    input_ids[k,:len(enc.ids)+5] = [0] + enc.ids + [2,2] + [s_tok] + [2]
    attention_mask[k,:len(enc.ids)+5] = 1
    if len(toks)>0:
        start_tokens[k,toks[0]+1] = 1
        end_tokens[k,toks[-1]+1] = 1
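
To convince ourselves that the targets are correct, we can decode the token span back into text for one row and compare it with the label. This check is my addition; the off-by-one indexing mirrors the <s> token (id 0) prepended to input_ids above:

# Decode the target span for one training row (assumes a span was found,
# i.e. len(toks) > 0 for this row).
k = 0
enc = tokenizer.encode(" " + " ".join(train.loc[k, 'text'].split()))
a = np.argmax(start_tokens[k])   # position includes the leading <s> token
b = np.argmax(end_tokens[k])
print(tokenizer.decode(enc.ids[a-1:b]))   # recovered span
print(train.loc[k, 'selected_text'])      # ground-truth label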

Test Data

We must tokenize the test data exactly as we tokenized the training data.

test = pd.read_csv('../input/tweet-sentiment-extraction/test.csv').fillna('')

ct = test.shape[0]
input_ids_t = np.ones((ct,MAX_LEN),dtype='int32')
attention_mask_t = np.zeros((ct,MAX_LEN),dtype='int32')
token_type_ids_t = np.zeros((ct,MAX_LEN),dtype='int32')

for k in range(test.shape[0]):
    # INPUT_IDS
    text1 = " "+" ".join(test.loc[k,'text'].split())
    enc = tokenizer.encode(text1)
    s_tok = sentiment_id[test.loc[k,'sentiment']]
    input_ids_t[k,:len(enc.ids)+5] = [0] + enc.ids + [2,2] + [s_tok] + [2]
    attention_mask_t[k,:len(enc.ids)+5] = 1
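
One thing worth checking (my addition) is that MAX_LEN = 96 is actually large enough: every encoded tweet plus the five special/sentiment tokens must fit, otherwise the slice assignments above would raise an error.

# Longest tokenized tweet in train and test, including the 5 extra tokens.
longest = max(
    len(tokenizer.encode(" " + " ".join(t.split())).ids) + 5
    for t in list(train.text) + list(test.text))
print(longest, '<=', MAX_LEN)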

Build the roBERTa Model

We use a pretrained roBERTa base model and add a custom question-answering head. Tokens are fed into bert_model, and we use BERT's first output, i.e. x[0] below. These are the embeddings of all input tokens, with shape (batch_size, MAX_LEN, 768). Next we apply tf.keras.layers.Conv1D(filters=1, kernel_size=1) to transform the embeddings into shape (batch_size, MAX_LEN, 1). We then flatten this and apply a softmax, so the final output x1 has shape (batch_size, MAX_LEN). This is a one-hot encoding of the start token index (of selected_text), and x2 is likewise the end token index.

def build_model():
    ids = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    att = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    tok = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)

    config = RobertaConfig.from_pretrained(PATH+'config-roberta-base.json')
    bert_model = TFRobertaModel.from_pretrained(PATH+'pretrained-roberta-base.h5',config=config)
    x = bert_model(ids,attention_mask=att,token_type_ids=tok)

    x1 = tf.keras.layers.Dropout(0.1)(x[0])
    x1 = tf.keras.layers.Conv1D(1,1)(x1)
    x1 = tf.keras.layers.Flatten()(x1)
    x1 = tf.keras.layers.Activation('softmax')(x1)

    x2 = tf.keras.layers.Dropout(0.1)(x[0])
    x2 = tf.keras.layers.Conv1D(1,1)(x2)
    x2 = tf.keras.layers.Flatten()(x2)
    x2 = tf.keras.layers.Activation('softmax')(x2)

    model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=[x1,x2])
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)

    return model
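
Note that Conv1D with kernel_size=1 is just a pointwise linear map applied independently to each of the MAX_LEN token embeddings, i.e. the same as a position-shared Dense(1). A small demonstration on random data (my addition):

# Conv1D(1,1) and Dense(1) with the same weights give identical outputs:
# both map (batch, MAX_LEN, 768) -> (batch, MAX_LEN, 1).
x = tf.random.normal((2, MAX_LEN, 768))
conv = tf.keras.layers.Conv1D(1, 1)
y_conv = conv(x)                       # builds the conv layer
dense = tf.keras.layers.Dense(1)
dense.build(x.shape)
dense.set_weights([conv.get_weights()[0][0], conv.get_weights()[1]])
print(np.allclose(y_conv.numpy(), dense(x).numpy(), atol=1e-5))  # True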

Metric

def jaccard(str1, str2):
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    if (len(a)==0) & (len(b)==0): return 0.5
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))
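
A few worked examples of the metric (my addition). It is the word-level Jaccard similarity |a∩b| / |a∪b| on lowercased word sets, with the empty-vs-empty case defined as 0.5:

print(jaccard("hello world", "hello"))   # 1 / (2 + 1 - 1) = 0.5
print(jaccard("Fun day", "fun day"))     # identical sets -> 1.0
print(jaccard("", ""))                   # both empty -> 0.5 by definition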

Train the roBERTa Model

We train with a 5-fold StratifiedKFold (stratified by sentiment). In each fold, the best model weights are saved, then reloaded before predicting the out-of-fold (OOF) samples and the test set. If you trained offline, run this notebook with the model.fit() line commented out; it will then load your offline weights in the model.load_weights() line instead. Update that line with the correct path, and make sure to change the KFold seed below to match your offline training. The notebook will then use your offline models to predict OOF and test; a minimal inference-only sketch follows the training loop below.

jac = []; VER='v0'; DISPLAY=1 # USE display=1 FOR INTERACTIVE
oof_start = np.zeros((input_ids.shape[0],MAX_LEN))
oof_end = np.zeros((input_ids.shape[0],MAX_LEN))
preds_start = np.zeros((input_ids_t.shape[0],MAX_LEN))
preds_end = np.zeros((input_ids_t.shape[0],MAX_LEN))

skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=777)
for fold,(idxT,idxV) in enumerate(skf.split(input_ids,train.sentiment.values)):

    print('#'*25)
    print('### FOLD %i'%(fold+1))
    print('#'*25)

    K.clear_session()
    model = build_model()

    sv = tf.keras.callbacks.ModelCheckpoint(
        '%s-roberta-%i.h5'%(VER,fold), monitor='val_loss', verbose=1, save_best_only=True,
        save_weights_only=True, mode='auto', save_freq='epoch')

    model.fit([input_ids[idxT,], attention_mask[idxT,], token_type_ids[idxT,]], [start_tokens[idxT,], end_tokens[idxT,]],
        epochs=3, batch_size=32, verbose=DISPLAY, callbacks=[sv],
        validation_data=([input_ids[idxV,],attention_mask[idxV,],token_type_ids[idxV,]],
        [start_tokens[idxV,], end_tokens[idxV,]]))

    print('Loading model...')
    model.load_weights('%s-roberta-%i.h5'%(VER,fold))

    print('Predicting OOF...')
    oof_start[idxV,],oof_end[idxV,] = model.predict([input_ids[idxV,],attention_mask[idxV,],token_type_ids[idxV,]],verbose=DISPLAY)

    print('Predicting Test...')
    preds = model.predict([input_ids_t,attention_mask_t,token_type_ids_t],verbose=DISPLAY)
    preds_start += preds[0]/skf.n_splits
    preds_end += preds[1]/skf.n_splits

    # DISPLAY FOLD JACCARD
    all = []
    for k in idxV:
        a = np.argmax(oof_start[k,])
        b = np.argmax(oof_end[k,])
        if a>b:
            st = train.loc[k,'text'] # IMPROVE CV/LB with better choice here
        else:
            text1 = " "+" ".join(train.loc[k,'text'].split())
            enc = tokenizer.encode(text1)
            st = tokenizer.decode(enc.ids[a-1:b])
        all.append(jaccard(st,train.loc[k,'selected_text']))
    jac.append(np.mean(all))
    print('>>>> FOLD %i Jaccard ='%(fold+1),np.mean(all))
    print()

print('>>>> OVERALL 5Fold CV Jaccard =',np.mean(jac))

# >>>> OVERALL 5Fold CV Jaccard = 0.7029308519012358
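
For completeness, here is the minimal inference-only fold body mentioned above, for the case where you trained offline. This sketch is my addition; the weights path is a placeholder you must replace with your own:

# Inference-only variant of the fold loop body: no model.fit(), just load
# offline weights and predict. The weights directory below is hypothetical.
K.clear_session()
model = build_model()
model.load_weights('../input/my-offline-weights/%s-roberta-%i.h5' % (VER, fold))
oof_start[idxV,], oof_end[idxV,] = model.predict(
    [input_ids[idxV,], attention_mask[idxV,], token_type_ids[idxV,]], verbose=DISPLAY)
preds = model.predict([input_ids_t, attention_mask_t, token_type_ids_t], verbose=DISPLAY)
preds_start += preds[0] / skf.n_splits
preds_end += preds[1] / skf.n_splits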

Submission

all = []
for k in range(input_ids_t.shape[0]):
    a = np.argmax(preds_start[k,])
    b = np.argmax(preds_end[k,])
    if a>b:
        st = test.loc[k,'text']
    else:
        text1 = " "+" ".join(test.loc[k,'text'].split())
        enc = tokenizer.encode(text1)
        st = tokenizer.decode(enc.ids[a-1:b])
    all.append(st)

test['selected_text'] = all
test[['textID','selected_text']].to_csv('submission.csv',index=False)
pd.set_option('max_colwidth', 60)
test.sample(25)
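
As a last sanity check (my addition), read the file back and confirm it has exactly the two columns and the row count the competition expects:

sub = pd.read_csv('submission.csv')
print(sub.columns.tolist())          # ['textID', 'selected_text']
print(sub.shape[0] == test.shape[0]) # one prediction per test row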