NLP Tools using advanced language modelling

October 1, 2021

We rely mainly on the excellent Hugging Face transformers package, which builds on PyTorch and comes with excellent tutorial resources, for example notebooks. Cloud computing can be helpful for working through a script, but to understand the individual parts it is usually advisable to run it locally. As a beginner, shared resources that often use workload managers can also be intimidating and hard to debug. A local installation is therefore still advisable in the beginning, although this may change with future improvements in usability. The examples below were run with the following package versions:

transformers==4.4.2
tokenizers==0.10.1
datasets==1.11.1.dev0
torch==1.9.0+cu102
pandas==1.1.0

Tokenizers

Two approaches are currently state-of-the-art; we sketch the idea behind each below.

WordPiece

WordPiece builds its subword vocabulary by iteratively merging the pair of symbols that most increases the likelihood of the training corpus; at tokenization time, words are split greedily into the longest matching vocabulary entries (used, for example, by BERT).

BPE (byte-pair encoding)

BPE starts from single characters (or, in the byte-level variant, raw bytes) and repeatedly merges the most frequent adjacent pair of symbols until a target vocabulary size is reached; RoBERTa and GPT-2 use byte-level BPE.
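A minimal sketch of the difference, assuming the standard bert-base-uncased (WordPiece) and roberta-base (byte-level BPE) checkpoints from the model hub:

from transformers import AutoTokenizer

# WordPiece tokenizer (BERT) and byte-level BPE tokenizer (RoBERTa)
wordpiece = AutoTokenizer.from_pretrained("bert-base-uncased")
bpe = AutoTokenizer.from_pretrained("roberta-base")

text = "Tokenization splits rare words into subwords"

# WordPiece marks word-internal pieces with '##',
# e.g. ['token', '##ization', 'splits', ...]
print(wordpiece.tokenize(text))

# byte-level BPE marks word beginnings with 'Ġ' (a space marker),
# e.g. ['Token', 'ization', 'Ġsplits', ...]
print(bpe.tokenize(text))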

Language models

A big plus is the model hub, which makes pretrained models easily available.

This is convenient and also ecologically sensible, since it avoids repeating energy-intensive pretraining (CITE ESTIMATES).

We can extend pretrained language models to other tasks by adding layers to them (for example a classification head): instantiate a model class and add the extra layers. This way we can take advantage of the pretrained representations and only need to learn the task-specific part from scratch.

In line with the discussion on ecological aspects of NLP, this lets us exploit synergies and build on pretrained models instead of training from scratch.

The dropout layer used in the classification head below randomly sets a fraction of its inputs (here 30%) to zero during training and rescales the rest, which acts as regularization and reduces overfitting; in evaluation mode it is switched off.
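A minimal stand-alone illustration in plain PyTorch (the rate 0.3 matches the classification head used below):

import torch

drop = torch.nn.Dropout(p=0.3)
x = torch.ones(10)

drop.train()    # training mode: roughly 30% of the entries are set to zero,
print(drop(x))  # the surviving entries are rescaled by 1/(1-p)

drop.eval()     # evaluation mode: dropout is a no-op
print(drop(x))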

twitter-roberta-base (cardiffnlp/twitter-roberta-base) is a RoBERTa base model pretrained on a large corpus of tweets and released together with the TweetEval benchmark [@barbieriTweetEvalUnifiedBenchmark2020]; it is therefore a good starting point for tweet classification tasks.

The fine-tuning script looks like this:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Fine-tune cardiffnlp/twitter-roberta-base for multi-label emotion
classification (11 labels) on the 2018-E-c English data.

Created on Thu Aug  5 17:05:19 2021

@author: maxpe
"""

import pandas as pd
import torch
import transformers
from datasets import Dataset
from transformers import AutoTokenizer, Trainer, TrainingArguments

# tokenizer matching the pretrained model
tokenizer = AutoTokenizer.from_pretrained(
    "cardiffnlp/twitter-roberta-base", model_max_length=512)

path = "~/"

# read the tab-separated training file; the columns after ID and Tweet
# hold the binary labels for the eleven emotions
dfpre = pd.read_csv(path + "2018-E-c-En-train.txt", sep="\t")

# collect the label columns into a single list-valued column
dfpre['list'] = dfpre[dfpre.columns[2:]].values.tolist()
df = dfpre[['Tweet', 'list']].copy()
df.rename(columns={'list': 'labels'}, inplace=True)


# same preprocessing for the gold-labelled test file
dftestpre = pd.read_csv(path + "2018-E-c-En-test-gold.txt", sep="\t")

dftestpre['list'] = dftestpre[dftestpre.columns[2:]].values.tolist()
dftest = dftestpre[['Tweet', 'list']].copy()
dftest.rename(columns={'list': 'labels'}, inplace=True)

# wrap the data frames as Hugging Face datasets
traindataset = Dataset.from_pandas(df)
evaldataset = Dataset.from_pandas(dftest)


# tokenize the tweets; pad/truncate to the model's maximum length
traindataset = traindataset.map(lambda e: tokenizer(
    e['Tweet'], truncation=True, padding='max_length'), batched=True)

# keep the labels column as well, the Trainer needs it to compute the loss
traindataset.set_format(type='torch', columns=[
                        'input_ids', 'attention_mask', 'labels'])

evaldataset = evaldataset.map(lambda e: tokenizer(
    e['Tweet'], truncation=True, padding='max_length'), batched=True)

evaldataset.set_format(type='torch', columns=[
                       'input_ids', 'attention_mask', 'labels'])

# plain PyTorch data loaders (only needed for manual inspection;
# the Trainer below builds its own loaders internally)
training_loader = torch.utils.data.DataLoader(traindataset)
eval_loader = torch.utils.data.DataLoader(evaldataset)


def loss_fn(outputs, targets):
    # binary cross-entropy over the eleven emotion logits (multi-label setting)
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)


class RobertaClass(torch.nn.Module):
    """Pretrained RoBERTa encoder with dropout and a linear classification head."""

    def __init__(self):
        super(RobertaClass, self).__init__()
        self.l1 = transformers.RobertaModel.from_pretrained(
            "cardiffnlp/twitter-roberta-base", return_dict=False)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 11)

    def forward(self, input_ids, attention_mask, labels):
        # the second return value is the pooled sentence representation
        _, output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)

        # the Trainer expects (loss, logits); BCEWithLogitsLoss takes (logits, targets)
        return (loss_fn(output, labels.float()), output)


model = RobertaClass()

model.train()

# "test_trainer" is the output directory; training hyperparameters keep their defaults
training_args = TrainingArguments("test_trainer", do_predict=True)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=traindataset,
    eval_dataset=evaldataset
)

trainer.train()
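After training, the model object still has to be written to disk so that the prediction script below can load it with torch.load. A minimal way to do this (the file name matches the example call at the end of this section; os.path.expanduser resolves the "~" in the path, which torch.save itself does not):

import os

# save the complete wrapper module (architecture plus weights) for later prediction
torch.save(model, os.path.expanduser(path + "roberta-twitter-emo_final"))

The second script loads the fine-tuned model and predicts the eleven emotion scores for a plain-text file with one tweet per line: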
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Predict the eleven emotion scores for a plain-text file (one tweet per line)
with the fine-tuned model from the script above.

Created on Wed Aug  4 17:56:24 2021

@author: maxpe
"""

import sys

import pandas as pd
import torch
import transformers
from datasets import load_dataset
from torch import cuda
from tqdm import tqdm
from transformers import AutoTokenizer

# choose GPU when available
device = 'cuda' if cuda.is_available() else 'cpu'


# command-line arguments: input text file and path to the saved model
file = sys.argv[1]

modelpath = sys.argv[2]


class RobertaClass(torch.nn.Module):
    """Same architecture as in the fine-tuning script; the class definition is
    needed here so that torch.load can unpickle the saved model."""

    def __init__(self):
        super(RobertaClass, self).__init__()
        self.l1 = transformers.RobertaModel.from_pretrained(
            "cardiffnlp/twitter-roberta-base", return_dict=False)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 11)

    def forward(self, ids, mask):
        _, output_1 = self.l1(ids, attention_mask=mask)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output


# load the complete fine-tuned model object saved by the training script
model = torch.load(modelpath)

# attach the configuration of the underlying RoBERTa model to the wrapper
model.config = transformers.RobertaConfig.from_pretrained(
    "cardiffnlp/twitter-roberta-base")

# set model to eval mode (disables dropout)
model.eval()

# use several GPUs if available
model = torch.nn.DataParallel(model)

model.to(device)


tokenizer = AutoTokenizer.from_pretrained(
    "cardiffnlp/twitter-roberta-base", model_max_length=512)

# the 'text' loader reads one example per line from the input file
dataset = load_dataset('text', data_files={'test': file})

dataset = dataset.map(lambda e: tokenizer(
    e['text'], truncation=True, padding='max_length'), batched=True)

dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# make this smaller when you get a memory error;
# larger batches are generally faster
BATCH_SIZE = 32

dataloader = torch.utils.data.DataLoader(dataset['test'], batch_size=BATCH_SIZE)

# inspect a single batch if needed
# next(iter(dataloader))

# create (or empty) the output file before appending batch results
open(file + "_11emo", "w").close()

with torch.no_grad():
    # exchange the commented line below for the loop header if you want a progress bar
    # for _, data in tqdm(enumerate(dataloader, 0), total=len(dataloader)):
    for _, data in enumerate(dataloader, 0):
        outputs = model(data['input_ids'], data['attention_mask'])
        # sigmoid turns the eleven logits into independent per-emotion probabilities
        fin_outputs = torch.sigmoid(outputs).tolist()
        # append one tab-separated row of scores per input line
        pd.DataFrame(fin_outputs).to_csv(
            file + "_11emo", index=False, header=False, sep="\t", mode='a')

Run it like this, where FILE is a plain-text file with one tweet per line:

python3 predict_11emoclasses.py FILE ~/roberta-twitter-emo_final
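The predictions end up in a file named after the input with the suffix _11emo. A minimal sketch for reading them back, assuming the call above (so the output file is FILE_11emo):

import pandas as pd

# one row of eleven sigmoid scores per input line, tab-separated, no header
scores = pd.read_csv("FILE_11emo", sep="\t", header=None)
print(scores.shape)  # (number of input lines, 11)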