This project fine-tunes DistilBERT on the Toxic Tweets dataset to predict which classes of toxicity a tweet falls under. Because a tweet can belong to several classes at once, this is a multi-label classification problem. The six classes of toxicity are: toxic, severe_toxic, obscene, threat, insult, and identity_hate.
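The raw CSV is first read into a pandas DataFrame. A minimal sketch, assuming the dataset's train.csv file sits next to the notebook:

import pandas as pd

# 'train.csv' is an assumed path; the file carries an 'id' column, the raw
# 'comment_text', and one 0/1 column per toxicity class.
df = pd.read_csv('train.csv')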
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased',
                                                do_lower_case=True)
# The six label columns follow 'id' and 'comment_text' in the CSV
classes = list(df.columns)[2:]
# Collapse the per-class 0/1 columns into a single multi-hot label vector
df['labels'] = df.iloc[:, 2:].values.tolist()
df['text'] = df['comment_text']
df.drop(['id', 'comment_text'], axis=1, inplace=True)
df.drop(classes, axis=1, inplace=True)
df.head()
class ToxicDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.texts = df.text
        self.targets = self.df.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = str(self.texts[index])
        text = " ".join(text.split())  # collapse repeated whitespace

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
        )

        ids = torch.LongTensor(inputs['input_ids'])
        mask = torch.LongTensor(inputs['attention_mask'])
        # DistilBERT does not consume token_type_ids; kept only for completeness
        token_type_ids = torch.LongTensor(inputs['token_type_ids'])
        targets = torch.FloatTensor(self.targets[index])

        return {
            'ids': ids,
            'mask': mask,
            'token_type_ids': token_type_ids,
            'targets': targets,
            'text': text
        }
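None of the glue between the Dataset class and the training loop is shown above, so the following is a minimal sketch: the hyperparameter values, the 80/20 split, and the DataLoader settings are all assumptions rather than the project's actual configuration.

# Assumed hyperparameters -- not given in the original
MAX_LEN = 128
BATCH_SIZE = 16
LEARNING_RATE = 1e-5
EPOCHS = 2

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Assumed 80/20 train/validation split; reset_index keeps the positional
# indexing used in __getitem__ valid after shuffling
df_train = df.sample(frac=0.8, random_state=42)
df_valid = df.drop(df_train.index).reset_index(drop=True)
df_train = df_train.reset_index(drop=True)

training_loader = DataLoader(ToxicDataset(df_train, tokenizer, MAX_LEN),
                             batch_size=BATCH_SIZE, shuffle=True)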
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=6,
    problem_type="multi_label_classification",
)
model.to(device)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
# With problem_type="multi_label_classification", the model applies
# BCEWithLogitsLoss internally whenever labels are passed, so no separate
# criterion is needed.
def train(epoch):
    model.train()
    for i, data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        optimizer.zero_grad()
        # DistilBERT takes no token_type_ids, so only ids and mask are passed;
        # passing labels makes the model return the BCE-with-logits loss
        out = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss = out.loss

        if i % 100 == 0:
            print(f'epoch: {epoch} | loss: {loss.item()}')

        loss.backward()
        optimizer.step()
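Training then runs for EPOCHS passes, and a tweet is scored by pushing the model's logits through a sigmoid, which gives an independent probability per class. The snippet below is a sketch: the 0.5 decision threshold and the sample input are placeholders, not values from the project.

for epoch in range(EPOCHS):
    train(epoch)

model.eval()
sample = "some tweet text to score"  # placeholder input
enc = tokenizer(sample, truncation=True, padding='max_length',
                max_length=MAX_LEN, return_tensors='pt').to(device)
with torch.no_grad():
    logits = model(input_ids=enc['input_ids'],
                   attention_mask=enc['attention_mask']).logits
probs = torch.sigmoid(logits).squeeze(0)
# 0.5 is an assumed threshold, not tuned on validation data
predicted = [classes[j] for j, p in enumerate(probs) if p > 0.5]
print(predicted)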