Data Preprocessing Techniques¶
(1) Data Cleaning: The following function cleans the text data. As the parameter we pass the entire dataframe, which is expected to have a column named 'text'. After cleaning, the function simply returns the dataframe for further processing. To install the required package, run 'pip install neattext' before importing the library.
import neattext.functions as nfx

def get_clean_text(df):
    df['text'] = df['text'].apply(nfx.remove_userhandles)
    df['text'] = df['text'].apply(nfx.remove_punctuations)
    df['text'] = df['text'].apply(nfx.remove_emojis)
    df['text'] = df['text'].apply(nfx.remove_hashtags)
    df['text'] = df['text'].apply(nfx.remove_html_tags)
    df['text'] = df['text'].apply(nfx.remove_stopwords)
    df['text'] = df['text'].apply(nfx.remove_urls)
    df['text'] = df['text'].apply(nfx.remove_phone_numbers)
    return df
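For illustration, a minimal usage sketch is shown below; the sample dataframe and its contents are hypothetical and only demonstrate the expected input format (a dataframe with a 'text' column).

import pandas as pd

# Hypothetical sample data with a 'text' column (any such dataframe works).
sample_df = pd.DataFrame({"text": [
    "@john Loved this product!!! 😍 #awesome https://example.com",
    "Call me at 555-123-4567 <b>today</b>"
]})

cleaned_df = get_clean_text(sample_df)
print(cleaned_df['text'].tolist())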
(2) Tokenization: Before feeding text data into the model we need to tokenize the text. We can install the transformers package, from which the AutoTokenizer class is loaded. We have to define the path of the folder where the 'special_tokens_map.json', 'tokenizer_config.json', 'tokenizer.json', and 'vocab.txt' files exist. As a parameter, the following function takes only a single text/sentence, so you can call it as many times as you have sentences. It returns a dictionary of tensors.
# Tokenize a sentence and map the tokens to their word IDs.
from transformers import AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("path of tokenizer", do_lower_case=True)

def tokenize(comment):
    input_ids = []
    attention_masks = []
    token_type_ids = []
    encoded_dict = tokenizer.encode_plus(
        str(comment),                 # Sentence to encode.
        add_special_tokens=True,      # Add '[CLS]' and '[SEP]'.
        max_length=512,               # Pad & truncate all sentences.
        padding='max_length',
        return_attention_mask=True,   # Construct attention masks.
        return_tensors='pt',          # Return PyTorch tensors.
        truncation=True
    )
    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])
    token_type_ids.append(encoded_dict['token_type_ids'])
    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    token_type_ids = torch.cat(token_type_ids, dim=0)
    return {"input_ids": input_ids, "mask": attention_masks, "token_type_ids": token_type_ids}
Model Initialization¶
The following class defines the text classification model. The class object is initialized with three values: the number of classes we are predicting, the dropout value, and the pretrained model name; the 'model_ckpt' value can be replaced with any transformer model available on Hugging Face. In the forward pass we take the pooled output, connect it to a dropout layer, and then pass it to the final output layer, which produces one score (logit) per class.
from transformers import AutoModel
import torch.nn as nn

class TextClassification(nn.Module):
    def __init__(self, n_classes, dropout, model_ckpt):
        super(TextClassification, self).__init__()
        self.bert = AutoModel.from_pretrained(model_ckpt)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(768, n_classes)

    def forward(self, ids, mask, token_type_ids):
        # Index 1 of the BERT output is the pooled [CLS] representation.
        pooledOut = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        dropOut = self.dropout(pooledOut[1])
        output = self.out(dropOut)
        return output

model = TextClassification(n_classes=5, dropout=0.9, model_ckpt="bert-base-uncased")
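To confirm the model is wired up correctly, we can run a dummy forward pass on a random batch; the token ids below are arbitrary placeholders, used only to show that the output has shape (batch_size, n_classes).

dummy_ids = torch.randint(0, 1000, (2, 512))             # arbitrary token ids
dummy_mask = torch.ones(2, 512, dtype=torch.long)        # no padding positions
dummy_type_ids = torch.zeros(2, 512, dtype=torch.long)   # single-segment input

with torch.no_grad():
    logits = model(dummy_ids, dummy_mask, dummy_type_ids)
print(logits.shape)  # torch.Size([2, 5])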
The following function is used to predict all target classes. As parameters, it takes the 'input ids', 'attention mask', and 'token type ids', as well as the device (cpu or gpu). Because the outputs are tensors, we first move the data to the cpu in order to convert it to numpy, optionally convert it further to a list, and then return the result.
def Classification(ids, mask, type_ids, device):
    ids = ids.to(device, dtype=torch.long)
    mask = mask.to(device, dtype=torch.long)
    token_type_ids = type_ids.to(device, dtype=torch.long)
    outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
    # Sigmoid turns the raw logits into per-class scores.
    result = torch.sigmoid(outputs)
    result = result.detach().cpu().numpy()
    scores = result.tolist()
    return scores
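Putting the pieces together, an end-to-end inference sketch might look like the following; the example sentence is made up, and we assume the tokenizer and model defined above are already loaded.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

encoded = tokenize("The delivery was late and the package arrived damaged.")
scores = Classification(encoded["input_ids"], encoded["mask"], encoded["token_type_ids"], device)
print(scores)  # one sigmoid score per class, e.g. [[0.12, 0.55, 0.08, 0.31, 0.44]]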