Clean text data by removing noise and unnecessary elements
Install the neattext library first: `pip install neattext`
import neattext.functions as nfx


def get_clean_text(df):
    """Clean the 'text' column of *df* by stripping common text noise.

    Args:
        df: pandas DataFrame with a string-valued 'text' column.

    Returns:
        The same DataFrame with its 'text' column cleaned in place.
    """
    # Order matters: structural artifacts (URLs, user handles, hashtags,
    # HTML tags, phone numbers) must be removed BEFORE punctuation,
    # otherwise stripping punctuation destroys the '@', '#', '://', and
    # digit-separator markers those removers rely on to match anything.
    cleaners = [
        nfx.remove_urls,
        nfx.remove_userhandles,
        nfx.remove_hashtags,
        nfx.remove_html_tags,
        nfx.remove_phone_numbers,
        nfx.remove_emojis,
        nfx.remove_punctuations,
        nfx.remove_stopwords,
    ]
    for clean in cleaners:
        df['text'] = df['text'].apply(clean)
    return df
Convert text into tokens that models can process
from transformers import AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained(
    "path/to/tokenizer",
    do_lower_case=True
)


def tokenize(comment):
    """Encode *comment* into fixed-length inputs for a BERT-style model.

    Args:
        comment: Raw text (coerced to str) to tokenize.

    Returns:
        dict with 'input_ids', 'mask' (attention mask), and
        'token_type_ids' — each a torch tensor of shape (1, 512).
    """
    encoded_dict = tokenizer.encode_plus(
        str(comment),
        add_special_tokens=True,
        max_length=512,
        # 'pad_to_max_length=True' is deprecated (and removed in recent
        # transformers releases); padding='max_length' is the supported
        # equivalent and pads every sequence to max_length.
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )
    return {
        "input_ids": encoded_dict['input_ids'],
        "mask": encoded_dict['attention_mask'],
        "token_type_ids": encoded_dict['token_type_ids']
    }
Define a custom BERT-based classification model
from transformers import AutoModel
import torch.nn as nn


class TextClassification(nn.Module):
    """BERT encoder with a dropout-regularized linear classification head."""

    def __init__(self, n_classes, dropout, model_ckpt):
        """
        Args:
            n_classes: Number of output classes.
            dropout: Dropout probability applied to the pooled output.
            model_ckpt: Hugging Face checkpoint name or path for the encoder.
        """
        super(TextClassification, self).__init__()
        self.bert = AutoModel.from_pretrained(model_ckpt)
        self.dropout = nn.Dropout(dropout)
        # Read the hidden size from the encoder's config instead of
        # hard-coding 768, so non-base checkpoints (e.g. *-large: 1024)
        # work without touching this class.
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, ids, mask, token_type_ids):
        """Return raw class logits of shape (batch, n_classes)."""
        encoder_out = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )
        # Index 1 of the encoder output is the pooled [CLS] representation.
        dropped = self.dropout(encoder_out[1])
        return self.out(dropped)


# Initialize model
# NOTE(review): dropout=0.9 zeroes 90% of pooled features each step —
# unusually aggressive; confirm this isn't a typo for 0.1.
model = TextClassification(
    n_classes=5,
    dropout=0.9,
    model_ckpt="bert-base-uncased"
)
Generate predictions from the trained model
def Classification(ids, mask, type_ids, device):
    """Run the file-global ``model`` on one batch and return sigmoid scores.

    Args:
        ids: Input token-id tensor.
        mask: Attention-mask tensor.
        type_ids: Token-type-id tensor.
        device: torch device to run inference on.

    Returns:
        Nested list of per-class sigmoid scores for the batch.
    """
    # Move tensors to device
    ids = ids.to(device, dtype=torch.long)
    mask = mask.to(device, dtype=torch.long)
    token_type_ids = type_ids.to(device, dtype=torch.long)

    # Inference only: no_grad skips building the autograd graph,
    # cutting memory use and speeding up prediction.
    with torch.no_grad():
        outputs = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

    # Sigmoid yields independent per-class probabilities
    # (multi-label-style scoring rather than a softmax distribution).
    result = torch.sigmoid(outputs)
    scores = result.cpu().numpy().tolist()
    return scores
Learn from labeled training data to make predictions
Discover patterns in unlabeled data
Assess model performance using various metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluate predictions against ground truth. The 'weighted' average
# weights each class's score by its support, which accounts for
# class imbalance in multi-class data.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')

# Report each metric to four decimal places.
for label, value in [
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
]:
    print(f"{label}: {value:.4f}")
Understanding the building blocks of deep learning
import torch
import torch.nn as nn


class SimpleNN(nn.Module):
    """Two-layer fully connected network with a ReLU between the layers."""

    def __init__(self, input_size, hidden_size, output_size):
        """
        Args:
            input_size: Width of the input feature vector.
            hidden_size: Width of the hidden layer.
            output_size: Number of output units (e.g. class logits).
        """
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a (batch, input_size) tensor to (batch, output_size) logits."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)


# Create model instance
model = SimpleNN(input_size=784, hidden_size=128, output_size=10)
Specialized networks for image processing
Networks for sequential data processing
Why modern AI is a systems problem, not just a modeling problem
Why foundational ML remains essential in LLM-centric systems
Aligning multiple modalities for unified reasoning
Treating LLMs as interfaces, planners, and controllers
Designing AI systems that plan, act, and adapt
Ensuring trustworthy behavior in autonomous systems
Viewing AI as an integrated system
Where modern AI research is heading
Distributed machine learning approach that preserves privacy
The core algorithm for federated learning
def federated_averaging(global_model, client_models, client_weights):
    """Aggregate client models into the global model by weighted averaging.

    Implements the FedAvg update: every global parameter becomes the
    weighted sum of the corresponding client parameters.

    Args:
        global_model: The current global model (updated in place).
        client_models: List of updated client models.
        client_weights: Weight per client (usually proportional to that
            client's data size; should sum to 1 for a true average).

    Returns:
        The updated global model (same object, for chaining).
    """
    # Zero-initialized accumulators, one per global parameter.
    totals = {
        name: torch.zeros_like(param.data)
        for name, param in global_model.named_parameters()
    }

    # Fold in each client's contribution, scaled by its weight.
    for weight, client_model in zip(client_weights, client_models):
        for name, param in client_model.named_parameters():
            totals[name] += weight * param.data

    # Write the aggregated values back into the global model.
    for name, param in global_model.named_parameters():
        param.data = totals[name]

    return global_model
Understanding the limitations and ongoing research areas