--- name: Computer Vision description: Implement computer vision tasks including image classification, object detection, segmentation, and pose estimation using PyTorch and TensorFlow --- # Computer Vision ## Overview Computer vision enables machines to understand visual information from images and videos, powering applications like autonomous driving, medical imaging, and surveillance. ## When to Use - Image classification and object recognition tasks - Object detection and localization in images - Semantic or instance segmentation projects - Pose estimation and human activity recognition - Face recognition and biometric systems - Medical imaging analysis and diagnostics ## Computer Vision Tasks - **Image Classification**: Categorizing images into classes - **Object Detection**: Locating and classifying objects in images - **Semantic Segmentation**: Pixel-level classification - **Instance Segmentation**: Detecting individual object instances - **Pose Estimation**: Identifying human body joints - **Face Recognition**: Identifying individuals in images ## Popular Architectures - **Classification**: ResNet, VGG, EfficientNet, Vision Transformer - **Detection**: YOLO, Faster R-CNN, SSD, RetinaNet - **Segmentation**: U-Net, DeepLab, Mask R-CNN - **Pose**: OpenPose, PoseNet, HRNet ## Python Implementation ```python import numpy as np import pandas as pd import matplotlib.pyplot as plt import matplotlib.patches as patches from PIL import Image, ImageDraw import torch import torch.nn as nn from torch.utils.data import DataLoader, TensorDataset from torchvision import transforms, models, datasets import tensorflow as tf from tensorflow import keras from tensorflow.keras import layers import cv2 from sklearn.metrics import accuracy_score, confusion_matrix import seaborn as sns import warnings warnings.filterwarnings('ignore') print("=== 1. Image Classification CNN ===") # Define image classification model class ImageClassifierCNN(nn.Module): def __init__(self, num_classes=10): super().__init__() self.features = nn.Sequential( nn.Conv2d(3, 32, kernel_size=3, padding=1), nn.ReLU(inplace=True), nn.BatchNorm2d(32), nn.MaxPool2d(2, 2), nn.Conv2d(32, 64, kernel_size=3, padding=1), nn.ReLU(inplace=True), nn.BatchNorm2d(64), nn.MaxPool2d(2, 2), nn.Conv2d(64, 128, kernel_size=3, padding=1), nn.ReLU(inplace=True), nn.BatchNorm2d(128), nn.MaxPool2d(2, 2), ) self.classifier = nn.Sequential( nn.Linear(128 * 4 * 4, 256), nn.ReLU(inplace=True), nn.Dropout(0.5), nn.Linear(256, num_classes) ) def forward(self, x): x = self.features(x) x = x.view(x.size(0), -1) x = self.classifier(x) return x model = ImageClassifierCNN(num_classes=10) print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}") # 2. Object Detection setup print("\n=== 2. Object Detection Framework ===") class ObjectDetector(nn.Module): def __init__(self): super().__init__() # Backbone self.backbone = nn.Sequential( nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2, 2), nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2, 2), ) # Bounding box regression self.bbox_head = nn.Sequential( nn.Linear(64 * 8 * 8, 128), nn.ReLU(), nn.Linear(128, 4) # x, y, w, h ) # Class prediction self.class_head = nn.Sequential( nn.Linear(64 * 8 * 8, 128), nn.ReLU(), nn.Linear(128, 10) # 10 classes ) def forward(self, x): features = self.backbone(x) features_flat = features.view(features.size(0), -1) bboxes = self.bbox_head(features_flat) classes = self.class_head(features_flat) return bboxes, classes detector = ObjectDetector() print(f"Detector parameters: {sum(p.numel() for p in detector.parameters()):,}") # 3. Semantic Segmentation print("\n=== 3. Semantic Segmentation U-Net ===") class UNet(nn.Module): def __init__(self, num_classes=5): super().__init__() # Encoder self.enc1 = self._conv_block(3, 32) self.pool1 = nn.MaxPool2d(2, 2) self.enc2 = self._conv_block(32, 64) self.pool2 = nn.MaxPool2d(2, 2) # Bottleneck self.bottleneck = self._conv_block(64, 128) # Decoder self.upconv2 = nn.ConvTranspose2d(128, 64, 2, stride=2) self.dec2 = self._conv_block(128, 64) self.upconv1 = nn.ConvTranspose2d(64, 32, 2, stride=2) self.dec1 = self._conv_block(64, 32) # Final output self.out = nn.Conv2d(32, num_classes, 1) def _conv_block(self, in_channels, out_channels): return nn.Sequential( nn.Conv2d(in_channels, out_channels, 3, padding=1), nn.ReLU(inplace=True), nn.Conv2d(out_channels, out_channels, 3, padding=1), nn.ReLU(inplace=True) ) def forward(self, x): enc1 = self.enc1(x) enc2 = self.enc2(self.pool1(enc1)) bottleneck = self.bottleneck(self.pool2(enc2)) dec2 = self.dec2(torch.cat([self.upconv2(bottleneck), enc2], 1)) dec1 = self.dec1(torch.cat([self.upconv1(dec2), enc1], 1)) return self.out(dec1) unet = UNet(num_classes=5) print(f"U-Net parameters: {sum(p.numel() for p in unet.parameters()):,}") # 4. Transfer Learning print("\n=== 4. Transfer Learning with Pre-trained Models ===") try: # Load pre-trained ResNet18 pretrained_model = models.resnet18(pretrained=True) num_ftrs = pretrained_model.fc.in_features pretrained_model.fc = nn.Linear(num_ftrs, 10) print(f"Pre-trained ResNet18 adapted for 10 classes") print(f"Parameters: {sum(p.numel() for p in pretrained_model.parameters()):,}") except: print("Pre-trained models not available") # 5. Image preprocessing and augmentation print("\n=== 5. Image Preprocessing and Augmentation ===") transform_basic = transforms.Compose([ transforms.Resize((224, 224)), transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ]) transform_augmented = transforms.Compose([ transforms.RandomRotation(20), transforms.RandomHorizontalFlip(), transforms.ColorJitter(brightness=0.2, contrast=0.2), transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)), transforms.Resize((224, 224)), transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ]) print("Augmentation transforms defined") # 6. Synthetic image data print("\n=== 6. Synthetic Image Data Creation ===") def create_synthetic_images(num_images=100, img_size=32): """Create synthetic images with shapes""" images = [] labels = [] for _ in range(num_images): img = np.ones((img_size, img_size, 3)) * 255 # Randomly draw shapes shape_type = np.random.randint(0, 3) if shape_type == 0: # Circle center = (np.random.randint(5, img_size-5), np.random.randint(5, img_size-5)) radius = np.random.randint(3, 10) cv2.circle(img, center, radius, (0, 0, 0), -1) labels.append(0) elif shape_type == 1: # Rectangle pt1 = (np.random.randint(0, img_size-10), np.random.randint(0, img_size-10)) pt2 = (pt1[0] + np.random.randint(5, 15), pt1[1] + np.random.randint(5, 15)) cv2.rectangle(img, pt1, pt2, (0, 0, 0), -1) labels.append(1) else: # Triangle pts = np.array([[np.random.randint(0, img_size), np.random.randint(0, img_size)], [np.random.randint(0, img_size), np.random.randint(0, img_size)], [np.random.randint(0, img_size), np.random.randint(0, img_size)]]) cv2.drawContours(img, [pts], 0, (0, 0, 0), -1) labels.append(2) images.append(img.astype(np.float32) / 255.0) return np.array(images), np.array(labels) X_images, y_labels = create_synthetic_images(num_images=300, img_size=32) print(f"Synthetic dataset: {X_images.shape}, Labels: {y_labels.shape}") print(f"Class distribution: {np.bincount(y_labels)}") # 7. Visualization print("\n=== 7. Visualization ===") fig, axes = plt.subplots(3, 3, figsize=(12, 10)) # Display synthetic images for i in range(9): idx = i % len(X_images) axes[i // 3, i % 3].imshow(X_images[idx]) axes[i // 3, i % 3].set_title(f"Class {y_labels[idx]}") axes[i // 3, i % 3].axis('off') plt.suptitle("Synthetic Image Dataset", fontsize=14, fontweight='bold') plt.tight_layout() plt.savefig('synthetic_images.png', dpi=100, bbox_inches='tight') print("Synthetic images saved as 'synthetic_images.png'") # 8. Model architectures comparison print("\n=== 8. Architecture Comparison ===") architectures_info = { 'CNN': ImageClassifierCNN(), 'ObjectDetector': ObjectDetector(), 'U-Net': UNet(), } arch_data = { 'Architecture': list(architectures_info.keys()), 'Parameters': [sum(p.numel() for p in m.parameters()) for m in architectures_info.values()], 'Use Case': ['Classification', 'Object Detection', 'Segmentation'] } arch_df = pd.DataFrame(arch_data) print("\nArchitecture Comparison:") print(arch_df.to_string(index=False)) # Visualization fig, axes = plt.subplots(1, 2, figsize=(14, 5)) # Parameters comparison axes[0].barh(arch_df['Architecture'], arch_df['Parameters'], color='steelblue') axes[0].set_xlabel('Number of Parameters') axes[0].set_title('Model Complexity Comparison') axes[0].set_xscale('log') # Use cases use_cases = ['Classification', 'Detection', 'Segmentation', 'Classification', 'Detection', 'Segmentation'] colors_map = {'Classification': 'green', 'Detection': 'orange', 'Segmentation': 'red'} bar_colors = [colors_map[uc] for uc in arch_df['Use Case']] axes[1].bar(arch_df['Architecture'], [1, 1, 1], color=bar_colors, alpha=0.7) axes[1].set_ylabel('Primary Task') axes[1].set_title('Architecture Use Cases') axes[1].set_ylim([0, 1.5]) plt.tight_layout() plt.savefig('cv_architecture_comparison.png', dpi=100, bbox_inches='tight') print("\nArchitecture comparison saved as 'cv_architecture_comparison.png'") # 9. Bounding box visualization print("\n=== 9. Bounding Box Visualization ===") fig, ax = plt.subplots(figsize=(10, 8)) ax.imshow(X_images[0]) # Draw sample bounding boxes bboxes = [ (5, 5, 15, 15), # x1, y1, x2, y2 (18, 10, 28, 20), (8, 20, 18, 28) ] for bbox in bboxes: rect = patches.Rectangle((bbox[0], bbox[1]), bbox[2]-bbox[0], bbox[3]-bbox[1], linewidth=2, edgecolor='red', facecolor='none') ax.add_patch(rect) ax.set_title('Bounding Box Detection Example') ax.axis('off') plt.savefig('bounding_boxes.png', dpi=100, bbox_inches='tight') print("Bounding box visualization saved as 'bounding_boxes.png'") print("\nComputer vision setup completed!") ``` ## Common CV Architectures - **Classification**: ResNet, EfficientNet, Vision Transformer - **Detection**: YOLO v5, Faster R-CNN, RetinaNet - **Segmentation**: U-Net, DeepLab v3, Mask R-CNN - **Tracking**: SORT, DeepSORT, ByteTrack ## Image Preprocessing - Resizing to standard dimensions - Normalization with ImageNet stats - Data augmentation (rotation, flip, crop) - Color space conversion ## Evaluation Metrics - **Classification**: Accuracy, Precision, Recall, F1 - **Detection**: mAP (mean Average Precision), IoU - **Segmentation**: IoU, Dice coefficient, Hausdorff distance ## Deliverables - Trained vision model - Inference pipeline - Performance evaluation - Visualization results - Model optimization report - Deployment guide