---
name: ML Model Training
description: Build and train machine learning models using scikit-learn, PyTorch, and TensorFlow for classification, regression, and clustering tasks
---

# ML Model Training

Training machine learning models involves selecting appropriate algorithms, preparing data, and optimizing model parameters to achieve strong predictive performance.

## Training Phases

- **Data Preparation**: Cleaning, encoding, normalization
- **Feature Engineering**: Creating meaningful features
- **Model Selection**: Choosing appropriate algorithms
- **Hyperparameter Tuning**: Optimizing model settings
- **Validation**: Cross-validation and evaluation metrics
- **Deployment**: Preparing models for production

## Common Algorithms

- **Regression**: Linear, Ridge, Lasso, Random Forest
- **Classification**: Logistic, SVM, Random Forest, Gradient Boosting
- **Clustering**: K-Means, DBSCAN, Hierarchical
- **Neural Networks**: MLPs, CNNs, RNNs, Transformers

## Python Implementation

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, roc_auc_score)
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import tensorflow as tf
from tensorflow import keras

# 1.
# Generate synthetic dataset
np.random.seed(42)
n_samples, n_features = 1000, 20

# Draw the feature matrix first and the label noise second so the seeded
# global random stream is consumed in a fixed order.
X = np.random.randn(n_samples, n_features)
label_noise = np.random.randn(n_samples) * 0.5
# A sample is positive when feature 0 + feature 1 - feature 2 (plus noise) is
# above zero; the remaining features carry no signal.
y = (X[:, 0] + X[:, 1] - X[:, 2] + label_noise > 0).astype(int)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Normalize features: statistics come from the training split only and are
# then applied unchanged to the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Dataset shapes:")
print(f"Training: {X_train_scaled.shape}, Testing: {X_test_scaled.shape}")
print(f"Class distribution: {np.bincount(y_train)}")

# 2. Scikit-learn models
print("\n=== Scikit-learn Models ===")

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
}

# Metrics computed from hard class predictions, in reporting order.
_LABEL_METRICS = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
}


def _score_on_test_split(estimator):
    """Fit ``estimator`` on the scaled training data and score it on the test data.

    Returns a dict with the label-based metrics followed by ``'roc_auc'``,
    which is computed from the predicted probability of the positive class.
    """
    estimator.fit(X_train_scaled, y_train)
    hard_labels = estimator.predict(X_test_scaled)
    positive_proba = estimator.predict_proba(X_test_scaled)[:, 1]

    scores = {
        label: scorer(y_test, hard_labels)
        for label, scorer in _LABEL_METRICS.items()
    }
    scores['roc_auc'] = roc_auc_score(y_test, positive_proba)
    return scores


sklearn_results = {}
for name, model in models.items():
    sklearn_results[name] = _score_on_test_split(model)

    print(f"\n{name}:")
    for metric, value in sklearn_results[name].items():
        print(f" {metric}: {value:.4f}")

# 3.
# PyTorch neural network
print("\n=== PyTorch Model ===")

# Seeding NumPy does not seed PyTorch. Without this call the weight
# initialisation, dropout masks and DataLoader shuffling differ on every run,
# so the reported loss and accuracy would not be reproducible.
torch.manual_seed(42)


class NeuralNetPyTorch(nn.Module):
    """Feed-forward binary classifier: input -> 64 -> 32 -> 1.

    Both hidden layers use ReLU followed by dropout (p=0.3). The final layer
    is passed through a sigmoid, so ``forward`` returns one probability per
    sample, which is the input ``nn.BCELoss`` expects.

    Args:
        input_size: Number of input features per sample.
    """

    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Return the positive-class probability for each row of ``x``."""
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = torch.sigmoid(self.fc3(x))
        return x


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
pytorch_model = NeuralNetPyTorch(n_features).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(pytorch_model.parameters(), lr=0.001)

# Create data loaders
# torch.as_tensor with an explicit dtype replaces the legacy
# torch.FloatTensor(ndarray) constructor. Targets get a trailing dimension so
# their shape matches the model's one-column output.
train_dataset = TensorDataset(
    torch.as_tensor(X_train_scaled, dtype=torch.float32),
    torch.as_tensor(y_train, dtype=torch.float32).unsqueeze(1),
)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Train PyTorch model
epochs = 50
pytorch_losses = []  # mean batch loss for each epoch
pytorch_model.train()  # make sure dropout is active while training
for epoch in range(epochs):
    total_loss = 0.0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        optimizer.zero_grad()
        outputs = pytorch_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    pytorch_losses.append(total_loss / len(train_loader))
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {pytorch_losses[-1]:.4f}")

# Evaluate PyTorch
pytorch_model.eval()  # disable dropout for inference
with torch.no_grad():
    test_inputs = torch.as_tensor(X_test_scaled, dtype=torch.float32).to(device)
    y_pred_pytorch = pytorch_model(test_inputs)
# Threshold the probabilities at 0.5 to obtain hard 0/1 labels.
y_pred_pytorch = (y_pred_pytorch.cpu().numpy() > 0.5).astype(int).flatten()
print(f"\nPyTorch Accuracy: {accuracy_score(y_test, y_pred_pytorch):.4f}")

# 4.
TensorFlow/Keras model print("\n=== TensorFlow/Keras Model ===") tf_model = keras.Sequential([ keras.layers.Dense(64, activation='relu', input_shape=(n_features,)), keras.layers.Dropout(0.3), keras.layers.Dense(32, activation='relu'), keras.layers.Dropout(0.3), keras.layers.Dense(1, activation='sigmoid') ]) tf_model.compile( optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'] ) history = tf_model.fit( X_train_scaled, y_train, batch_size=32, epochs=50, validation_split=0.2, verbose=0 ) y_pred_tf = (tf_model.predict(X_test_scaled) > 0.5).astype(int).flatten() print(f"TensorFlow Accuracy: {accuracy_score(y_test, y_pred_tf):.4f}") # 5. Visualization fig, axes = plt.subplots(2, 2, figsize=(12, 10)) # Model comparison models_names = list(sklearn_results.keys()) + ['PyTorch', 'TensorFlow'] accuracies = [sklearn_results[m]['accuracy'] for m in sklearn_results.keys()] + \ [accuracy_score(y_test, y_pred_pytorch), accuracy_score(y_test, y_pred_tf)] axes[0, 0].bar(range(len(models_names)), accuracies, color='steelblue') axes[0, 0].set_xticks(range(len(models_names))) axes[0, 0].set_xticklabels(models_names, rotation=45) axes[0, 0].set_ylabel('Accuracy') axes[0, 0].set_title('Model Comparison') axes[0, 0].set_ylim([0, 1]) # Training loss curves axes[0, 1].plot(pytorch_losses, label='PyTorch', linewidth=2) axes[0, 1].plot(history.history['loss'], label='TensorFlow', linewidth=2) axes[0, 1].set_xlabel('Epoch') axes[0, 1].set_ylabel('Loss') axes[0, 1].set_title('Training Loss Comparison') axes[0, 1].legend() axes[0, 1].grid(True, alpha=0.3) # Scikit-learn metrics metrics = ['accuracy', 'precision', 'recall', 'f1'] rf_metrics = [sklearn_results['Random Forest'][m] for m in metrics] axes[1, 0].bar(metrics, rf_metrics, color='coral') axes[1, 0].set_ylabel('Score') axes[1, 0].set_title('Random Forest Metrics') axes[1, 0].set_ylim([0, 1]) # Validation accuracy over epochs axes[1, 1].plot(history.history['accuracy'], label='Training', linewidth=2) axes[1, 
1].plot(history.history['val_accuracy'], label='Validation', linewidth=2)
axes[1, 1].set_xlabel('Epoch')
axes[1, 1].set_ylabel('Accuracy')
axes[1, 1].set_title('TensorFlow Training History')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('model_training_comparison.png', dpi=100, bbox_inches='tight')
print("\nVisualization saved as 'model_training_comparison.png'")

print("\nModel training completed!")
```

## Training Best Practices

- **Data Split**: 70/15/15 for train/validation/test
- **Scaling**: Normalize features before training
- **Cross-validation**: Use K-fold for robust evaluation
- **Early Stopping**: Prevent overfitting
- **Class Balancing**: Handle imbalanced datasets

## Key Metrics

- **Accuracy**: Overall correctness
- **Precision**: Positive prediction accuracy
- **Recall**: True positive detection rate
- **F1 Score**: Harmonic mean of precision/recall
- **ROC-AUC**: Threshold-independent metric

## Deliverables

- Trained model checkpoint
- Performance metrics on test set
- Feature importance analysis
- Learning curves
- Hyperparameter configuration
- Model evaluation report