Date: 23.07.11 ~ 23.07.16
Writer: 9tailwolf : doryeon514@gm.gist.ac.kr

Library and Setting

```python
import torch
from torch import nn
import pandas as pd
import numpy as np
import random
import time
import datetime

# AdamW from transformers is deprecated in newer releases; torch.optim.AdamW is a drop-in alternative.
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel, AutoConfig
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Apple Silicon GPU; use 'cuda' or 'cpu' on other machines.
device = 'mps'
```

Tokenizer Function

A simple word-level vocabulary builder (word to index and index to word). Note that the KcELECTRA pipeline below uses the Hugging Face AutoTokenizer instead, and the main function later rebinds the name `tokenizer` to it.

```python
def tokenizer(sentence):
    dict_number = {}  # word -> index
    dict_word = {}    # index -> word
    for s in sentence:
        for word in s.split():
            if word not in dict_number:
                dict_number[word] = len(dict_number)
                dict_word[len(dict_number) - 1] = word
    return dict_number, dict_word
```

Making Data Function

```python
def make_data(input_ids, attention_masks, labels):
    # Hold out 3% of the data for validation (same random_state so inputs and masks stay aligned).
    train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
        input_ids, labels, random_state=42, test_size=0.03)
    train_masks, validation_masks, _, _ = train_test_split(
        attention_masks, input_ids, random_state=42, test_size=0.03)

    train_labels = train_labels.tolist()
    validation_labels = validation_labels.tolist()

    train_inputs = torch.LongTensor(train_inputs)
    train_labels = torch.FloatTensor(train_labels)
    train_masks = torch.LongTensor(train_masks)
    validation_inputs = torch.LongTensor(validation_inputs)
    validation_labels = torch.FloatTensor(validation_labels)
    validation_masks = torch.LongTensor(validation_masks)

    BATCH_SIZE = 16

    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

    validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=BATCH_SIZE)

    return train_data, train_sampler, train_dataloader, validation_data, validation_sampler, validation_dataloader
```

Training Function

```python
def flat_accuracy(preds, labels):
    # Predictions are logits and labels are one-hot vectors, so compare argmax indices.
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = np.argmax(labels, axis=1).flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def format_time(elapsed):
    elapsed_rounded = int(round(elapsed))
    return str(datetime.timedelta(seconds=elapsed_rounded))


def training(model, datasets):
    train_data, train_sampler, train_dataloader, validation_data, validation_sampler, validation_dataloader = datasets

    optimizer = AdamW(model.parameters(), lr=4e-5, eps=1e-8)
    epochs = 5
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    loss_fn = nn.CrossEntropyLoss()

    # Fix the random seeds for reproducibility (the CUDA call is a no-op on MPS/CPU).
    seed_val = 100
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    model.zero_grad()

    for epoch_i in range(0, epochs):
        t0 = time.time()
        total_loss = 0
        model.train()

        for step, batch in enumerate(train_dataloader):
            if step % 10 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,} of {:>5,}.  Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch

            outputs = model(b_input_ids, attention_mask=b_input_mask)
            loss = loss_fn(outputs, b_labels)
            total_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            model.zero_grad()

        avg_train_loss = total_loss / len(train_dataloader)
        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

        print("")
        print("Running Validation...")
        t0 = time.time()
        model.eval()

        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        for batch in validation_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            with torch.no_grad():
                outputs = model(b_input_ids, attention_mask=b_input_mask)
            outputs = outputs.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            tmp_eval_accuracy = flat_accuracy(outputs, label_ids)
            eval_accuracy += tmp_eval_accuracy
            nb_eval_steps += 1

        print("  Accuracy: {0:.2f}".format(eval_accuracy / nb_eval_steps))
        print("  Validation took: {:}".format(format_time(time.time() - t0)))

    print("")
    print("Training complete!")
    return model
```
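The main function below instantiates KcELECTRA_NN, but its definition is not included in this section. The following is only a minimal sketch of what such a class could look like, assuming the two constructor arguments are the dropout probability and the number of classes and that the model feeds the [CLS] representation of beomi/KcELECTRA-base through a linear head; it is not the author's exact definition.

```python
class KcELECTRA_NN(nn.Module):
    # Sketch only: constructor arguments assumed to be (dropout probability, number of classes),
    # matching the KcELECTRA_NN(0.5, 3) call in the main function.
    def __init__(self, dropout_p, n_classes):
        super().__init__()
        self.electra = AutoModel.from_pretrained("beomi/KcELECTRA-base")
        self.dropout = nn.Dropout(dropout_p)
        self.classifier = nn.Linear(self.electra.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        # token_type_ids is accepted (and ignored) so the call in test_model works.
        hidden = self.electra(input_ids, attention_mask=attention_mask).last_hidden_state
        cls_vector = self.dropout(hidden[:, 0])   # [CLS] token representation
        return self.classifier(cls_vector)        # raw logits for CrossEntropyLoss
```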
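The training call also needs the `datasets` tuple from make_data, but the step that builds `input_ids`, `attention_masks`, and `labels` is not shown in this section. Below is a rough sketch of that preprocessing, assuming a pandas DataFrame `df` with a text column 'document' and an integer label column 'label' in {0, 1, 2}; the DataFrame, its column names, the one-hot encoding, and the max length of 150 (taken from the test function) are all assumptions.

```python
# Sketch only: df, its column names, and the one-hot label encoding are assumptions.
tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base")

sentences = ['[CLS] ' + s + ' [SEP]' for s in df['document']]
tokenized = [tokenizer.tokenize(s) for s in sentences]
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(t) for t in tokenized],
                          maxlen=150, dtype='long', truncating='post', padding='post')

# Attention mask: 1 for real tokens, 0 for padding.
attention_masks = [[float(i > 0) for i in seq] for seq in input_ids]

# One-hot labels (3 classes), matching the FloatTensor targets and the argmax in flat_accuracy.
labels = np.eye(3)[df['label'].to_numpy()]

datasets = make_data(input_ids, attention_masks, labels)
```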
Main Function

```python
model = KcELECTRA_NN(0.5, 3).to(device)   # dropout 0.5, 3 output classes
tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base")

# datasets is the tuple returned by make_data(input_ids, attention_masks, labels).
model = training(model, datasets)
```

Test

```python
def test_model(fmodel, seq):
    # Special tokens must be uppercase ([CLS]/[SEP]) to be recognized by the tokenizer.
    seq = '[CLS] ' + seq + ' [SEP]'
    sentence = tokenizer.tokenize(seq)
    test_ids = tokenizer.convert_tokens_to_ids(sentence)
    test_ids = pad_sequences([test_ids], maxlen=150, dtype='long', truncating='post', padding='post')
    test_mask = [float(i > 0) for i in test_ids[0]]

    test_ids = torch.tensor(test_ids).to(device)
    test_mask = torch.tensor([test_mask]).to(device)

    with torch.no_grad():
        outputs = fmodel(test_ids, token_type_ids=None, attention_mask=test_mask)

    # Return the index of the highest of the three class scores.
    return np.argmax([outputs[0][0].item(), outputs[0][1].item(), outputs[0][2].item()])
```
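A quick usage sketch. The example sentence is arbitrary, and what the indices 0/1/2 mean depends on how the training labels were encoded:

```python
# Predict the class of a single Korean sentence with the fine-tuned model.
pred = test_model(model, "이 영화 정말 재밌어요")
print(pred)   # 0, 1, or 2: the index of the highest-scoring class
```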