from pathlib import Path
from typing import List

import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, PreTrainedTokenizer


def load_data(file_path: str, tokenizer: PreTrainedTokenizer = None):
    klue_data = Path(file_path)
    klue_text = klue_data.read_text().strip()
    # one sentence per block; blocks are separated by blank lines
    documents = klue_text.split("\n\n")

    data_list = []
    for doc in documents:
        char_labels = []
        token_labels = []
        chars = []
        sentence = ""
        for line in doc.split("\n"):
            # "##" lines carry metadata, not character/tag pairs
            if line.startswith("##"):
                continue
            token, tag = line.split("\t")
            sentence += token
            char_labels.append(tag)
            chars.append(token)

        # align character-level tags to tokens via offset mappings
        # (requires a fast tokenizer; special tokens get the offset (0, 0))
        offset_mappings = tokenizer(sentence, return_offsets_mapping=True)["offset_mapping"]
        for offset in offset_mappings:
            start, end = offset
            if start == end == 0:
                continue
            # label each token with the tag of its first character
            token_labels.append(char_labels[start])

        instance = {
            "sentence": sentence,
            "token_label": token_labels,
            "char_label": char_labels,
            "offset_mapping": offset_mappings,
        }
        data_list.append(instance)

    return data_list
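# For reference, each document block in the KLUE NER file looks roughly like
# the sample below (illustrative, not copied from the dataset): "##" lines
# hold metadata, every other line is "<character>\t<BIO tag>", and blank
# lines separate sentences.
#
#   ## klue-ner-v1_train_00000
#   김	B-PS
#   철	I-PS
#   수	I-PS
#   는	O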
# BIO tags for the six KLUE entity types: person (PS), location (LC),
# organization (OG), date (DT), time (TI), and quantity (QT), plus "O"
# for characters outside any entity.
labels = [
    "B-PS", "I-PS",
    "B-LC", "I-LC",
    "B-OG", "I-OG",
    "B-DT", "I-DT",
    "B-TI", "I-TI",
    "B-QT", "I-QT",
    "O",
]

label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}
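# Quick sanity check on the mappings (ids follow the order of `labels`):
# label2id["B-PS"] == 0 and id2label[12] == "O"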
class NerDataset(Dataset):
    def __init__(
        self,
        tokenizer: PreTrainedTokenizer,
        examples: List,
        max_length: int = 512,  # the original read an undefined global here
        shuffle: bool = False,
        **kwargs,
    ):
        self.dataset = examples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        # return the raw instance; batching and padding happen in collate_fn
        instance = self.dataset[index]
        return instance
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")
# assumed example path; the original relies on a `file_path` defined elsewhere
file_path = "klue-ner-v1_train.tsv"
examples = load_data(file_path, tokenizer)

print(examples[0])
max_length = 512  # assumed value; the original relies on a variable defined elsewhere
dataset = NerDataset(
    tokenizer=tokenizer,
    examples=examples,
    max_length=max_length,
)
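# The original snippet references collate_fn below without defining it. This
# is a minimal sketch of one plausible implementation, assuming each batch
# item carries the fields produced by load_data: it re-tokenizes the
# sentences with padding, maps the string labels to ids, and fills
# special-token and padding positions with -100 so the loss ignores them.
def collate_fn(batch):
    sentences = [item["sentence"] for item in batch]
    encodings = tokenizer(
        sentences,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    labels_tensor = torch.full(encodings["input_ids"].shape, -100, dtype=torch.long)
    for i, item in enumerate(batch):
        ids = [label2id[tag] for tag in item["token_label"]]
        # token_label holds one label per non-special token; the same
        # tokenizer produced them, so they line up starting right after [CLS]
        n = min(len(ids), labels_tensor.shape[1] - 2)
        labels_tensor[i, 1 : n + 1] = torch.tensor(ids[:n], dtype=torch.long)
    encodings["labels"] = labels_tensor
    return encodings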
data_loader = DataLoader(
    dataset=dataset,
    collate_fn=collate_fn,  # sketched above; note batch_size defaults to 1
)
for batch in data_loader:
    print(batch)
    break
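# Downstream usage sketch (an assumption, not part of the original snippet):
# the label mappings defined above plug straight into a HuggingFace token
# classification head, and a collated batch can be fed to it directly.
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    "klue/bert-base",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)
outputs = model(**batch)  # batch["labels"] drives the token-level cross-entropy loss
print(outputs.loss)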