from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
import gluonnlp as nlp

# Load the SentencePiece tokenizer, the pretrained KoBERT model, and its vocabulary
tokenizer = get_tokenizer()
bertmodel, vocab = get_pytorch_kobert_model()
token = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

# Pair each sentence with its label (as a string); this must be built before it is
# passed to BERTDataset below
train_set_data = [[i, str(j)] for i, j in zip(train_set['data'], train_set['label'])]
data_train = BERTDataset(dataset=train_set_data, sent_idx=0, label_idx=1,
                         bert_tokenizer=token, max_len=64, pad=True, pair=False)

print(train_set_data[0])
print(data_train[0])
(array([ 2, 4688, 6797, 5940, 950, 7795, 4384, 7088, 2573, 5794, 5439, 5007, 5112, 5330, 1370, 517, 54, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32), array(18, dtype=int32), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32), 4)
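The printed item is a 4-tuple: the token ids padded out to max_len=64 (here the first 18 positions are real tokens, opening with [CLS]=2 and closing with [SEP]=3), the valid length (18), the segment ids (all zeros for a single, unpaired sentence), and the label (4).

Note that BERTDataset is not part of the kobert package, so it has to be defined in your own script. Below is a minimal sketch consistent with the call and the printed tuple above, assuming the usual pattern of wrapping gluonnlp's BERTSentenceTransform in a PyTorch Dataset; the exact class body in your code may differ.

import numpy as np
import gluonnlp as nlp
from torch.utils.data import Dataset

class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len, pad, pair):
        # Turns a raw sentence into (token_ids, valid_length, segment_ids)
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)
        self.sentences = [transform([row[sent_idx]]) for row in dataset]
        self.labels = [np.int32(row[label_idx]) for row in dataset]

    def __getitem__(self, i):
        # Append the label to the transform's 3-tuple, giving the 4-tuple printed above
        return self.sentences[i] + (self.labels[i],)

    def __len__(self):
        return len(self.labels)

From here, data_train can be fed directly into a torch.utils.data.DataLoader for fine-tuning.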