# Advanced Examples

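This guide collects worked examples for common crawling scenarios, from single-page scraping to multi-step, authenticated workflows. The snippets use top-level `await`, so run them as ES modules or wrap them in an `async` function.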

## E-commerce Product Scraping

### Basic Product Information

```typescript
import { createScraper } from 'crawlx';

const scraper = createScraper({
  plugins: {
    delay: { enabled: true, defaultDelay: 2000 },
    rateLimit: { enabled: true }
  }
});

// Extraction rule for a single product page. Each value is a selector,
// optionally followed by `| filter` steps applied to the extracted value.
const productRule = {
  name: 'h1.product-title',
  price: '.price-current | trim | number',
  originalPrice: '.price-original | trim | number',
  discount: '.discount-percentage | trim',
  rating: '.rating-stars@data-rating | number',
  reviewCount: '.review-count | trim | number',
  availability: '.stock-status',
  images: ['img.product-image@src'],
  description: '.product-description | trim',
  specifications: {
    _scope: '.spec-table tr',
    name: '.spec-name',
    value: '.spec-value'
  },
  variants: {
    _scope: '.variant-option',
    name: '.variant-name',
    price: '.variant-price | number',
    available: '.variant-stock | boolean'
  }
};

// Clean up in `finally` so the scraper is destroyed even when the crawl rejects.
try {
  const result = await scraper.crawl('https://shop.example.com/product/123', {
    parse: productRule
  });

  console.log(result.parsed);
} finally {
  await scraper.destroy();
}
```

### Multi-page Product Catalog

```typescript
import { createSpider } from 'crawlx';

const spider = createSpider({
  concurrency: 5,
  plugins: {
    follow: {
      enabled: true,
      maxDepth: 3,
      sameDomainOnly: true,
      maxLinksPerPage: 50
    },
    duplicateFilter: { enabled: true }
  }
});

// Rule for a category listing page: one entry per product card plus the
// pagination state of the page.
const catalogRule = {
  products: {
    _scope: '.product-item',
    name: '.product-name',
    price: '.product-price | trim | number',
    url: '.product-link@href',
    image: '.product-image@src',
    rating: '.product-rating@data-rating | number'
  },
  pagination: {
    nextPage: '.pagination .next@href',
    currentPage: '.pagination .current | number',
    totalPages: '.pagination .total | number'
  }
};

// Clean up in `finally` so the spider is destroyed even when a crawl rejects.
try {
  const results = await spider.crawlMany(['https://shop.example.com/category/electronics'], {
    parse: catalogRule,
    follow: '.pagination .next@href, .product-link@href'
  });

  // Process all products. Followed product-detail pages have no `.product-item`
  // cards, so a result without parsed data or products contributes nothing
  // instead of throwing.
  const allProducts = results
    .flatMap(result => result.parsed?.products ?? [])
    .filter(Boolean);

  console.log(`Found ${allProducts.length} products`);
} finally {
  await spider.destroy();
}
```

## News and Blog Scraping

### Article Extraction

```typescript
import { createScraper } from 'crawlx';

// No options are passed, so the scraper uses whatever defaults createScraper applies.
const newsScraper = createScraper();

// Extraction rule for a news article page. String values are selectors
// (comma-separated alternatives, optional `| filter` steps); function values
// receive the page's `$` query function and compute a value directly.
const articleRule = {
  headline: 'h1.article-title, h1.headline',
  subheadline: '.article-subtitle, .subheadline',
  author: {
    name: '.author-name, .byline-author',
    url: '.author-link@href',
    bio: '.author-bio'
  },
  publishDate: '.publish-date, .article-date | date',
  updateDate: '.update-date | date',
  category: '.article-category, .section-name',
  tags: ['.article-tags .tag, .keywords .keyword'],
  content: {
    paragraphs: ['.article-content p'],
    images: ['.article-content img@src'],
    videos: ['.article-content video@src']
  },
  metadata: {
    wordCount: ($) => {
      // Trim before splitting: ''.split(/\s+/) yields [''] (length 1), and
      // leading/trailing whitespace would otherwise add phantom empty tokens.
      const text = $('.article-content').text().trim();
      return text === '' ? 0 : text.split(/\s+/).length;
    },
    readTime: '.read-time | number',
    shareCount: '.share-count | number'
  },
  relatedArticles: {
    _scope: '.related-articles .article',
    title: '.article-title',
    url: '.article-link@href',
    image: '.article-image@src'
  }
};

// Clean up in `finally` so the scraper is destroyed even when the crawl rejects.
try {
  const result = await newsScraper.crawl('https://news.example.com/article/123', {
    parse: articleRule
  });

  console.log(result.parsed);
} finally {
  await newsScraper.destroy();
}
```

### RSS Feed Processing

```typescript
import { createLightweightCrawler } from 'crawlx';

const feedCrawler = createLightweightCrawler();

// Rule for an RSS document: channel-level fields plus one entry per <item>.
const rssRule = {
  title: 'channel > title',
  description: 'channel > description',
  link: 'channel > link',
  lastBuildDate: 'channel > lastBuildDate | date',
  items: {
    _scope: 'item',
    title: 'title',
    link: 'link',
    description: 'description',
    pubDate: 'pubDate | date',
    category: 'category',
    guid: 'guid'
  }
};

// Clean up in `finally` so the crawler is destroyed even when the crawl rejects.
try {
  const result = await feedCrawler.crawl('https://news.example.com/rss', {
    parse: rssRule
  });

  // Fall back to an empty list so a response without parsed items reports 0
  // instead of throwing on `undefined.length`.
  const items = result.parsed?.items ?? [];
  console.log(`Found ${items.length} articles`);
} finally {
  await feedCrawler.destroy();
}
```

## Social Media Monitoring

### Twitter-like Platform

```typescript
import { createMonitor } from 'crawlx';

const socialMonitor = createMonitor({
  plugins: {
    delay: { enabled: true, defaultDelay: 5000 },
    duplicateFilter: { enabled: false } // Allow re-crawling for updates
  }
});

// Rule for a profile timeline: one entry per post, with nested author,
// engagement metrics, and attached media.
const tweetRule = {
  posts: {
    _scope: '.tweet, .post',
    id: '@data-tweet-id',
    author: {
      username: '.username',
      displayName: '.display-name',
      avatar: '.avatar@src',
      verified: '.verified-badge | boolean'
    },
    content: '.tweet-text, .post-content',
    timestamp: '.timestamp@datetime | date',
    metrics: {
      likes: '.like-count | number',
      retweets: '.retweet-count | number',
      replies: '.reply-count | number'
    },
    media: {
      images: ['.media-image@src'],
      videos: ['.media-video@src']
    },
    hashtags: ['.hashtag'],
    mentions: ['.mention@data-user']
  }
};

// Monitor multiple accounts
const accounts = [
  'https://social.example.com/user1',
  'https://social.example.com/user2',
  'https://social.example.com/user3'
];

// Clean up in `finally` so the monitor is destroyed even when a crawl rejects.
try {
  const results = await socialMonitor.crawlMany(accounts, {
    parse: tweetRule
  });

  // Process new posts. A result without parsed data or posts is skipped
  // instead of throwing on `undefined.forEach`.
  for (const result of results) {
    for (const post of result.parsed?.posts ?? []) {
      console.log(`${post.author?.username ?? 'unknown'}: ${post.content}`);
    }
  }
} finally {
  await socialMonitor.destroy();
}
```

## Real Estate Listings

```typescript
import { createScraper } from 'crawlx';

const realEstateScraper = createScraper({
  concurrency: 3,
  plugins: {
    delay: { enabled: true, defaultDelay: 3000 }
  }
});

// Rule for a single listing page, grouped by concern: headline facts,
// physical details, agent contact, location, and price history.
const listingRule = {
  property: {
    address: '.property-address',
    price: '.property-price | trim | number',
    pricePerSqft: '.price-per-sqft | number',
    type: '.property-type',
    status: '.listing-status'
  },
  details: {
    bedrooms: '.bedrooms | number',
    bathrooms: '.bathrooms | number',
    sqft: '.square-feet | number',
    lotSize: '.lot-size',
    yearBuilt: '.year-built | number',
    parking: '.parking-spaces | number'
  },
  features: ['.feature-list .feature'],
  description: '.property-description | trim',
  images: ['.property-images img@src'],
  virtualTour: '.virtual-tour@href',
  agent: {
    name: '.agent-name',
    phone: '.agent-phone',
    email: '.agent-email',
    company: '.agent-company'
  },
  location: {
    neighborhood: '.neighborhood',
    school: '.school-district',
    walkScore: '.walk-score | number'
  },
  history: {
    _scope: '.price-history .entry',
    date: '.history-date | date',
    price: '.history-price | number',
    event: '.history-event'
  }
};

// Clean up in `finally` so the scraper is destroyed even when the crawl rejects.
try {
  const result = await realEstateScraper.crawl('https://realestate.example.com/listing/123', {
    parse: listingRule
  });

  console.log(result.parsed);
} finally {
  await realEstateScraper.destroy();
}
```

## Job Board Scraping

```typescript
import { createSpider } from 'crawlx';

const jobSpider = createSpider({
  plugins: {
    follow: {
      enabled: true,
      maxDepth: 2,
      maxLinksPerPage: 100
    }
  }
});

// Rule for a search-results page: one entry per job card plus pagination info.
const jobRule = {
  jobs: {
    _scope: '.job-listing',
    title: '.job-title',
    company: '.company-name',
    location: '.job-location',
    salary: '.salary-range',
    type: '.job-type', // Full-time, Part-time, Contract
    remote: '.remote-option | boolean',
    posted: '.posted-date | date',
    url: '.job-link@href',
    description: '.job-summary',
    requirements: ['.requirements li'],
    benefits: ['.benefits li']
  },
  pagination: {
    nextPage: '.pagination .next@href',
    totalJobs: '.total-results | number'
  }
};

// Clean up in `finally` so the spider is destroyed even when a crawl rejects.
try {
  const results = await jobSpider.crawlMany(['https://jobs.example.com/search?q=developer'], {
    parse: jobRule,
    follow: '.pagination .next@href'
  });

  // Merge the jobs from every results page; a page without parsed data or
  // jobs contributes nothing instead of throwing.
  const allJobs = results
    .flatMap(result => result.parsed?.jobs ?? [])
    .filter(Boolean);

  console.log(`Found ${allJobs.length} job listings`);
} finally {
  await jobSpider.destroy();
}
```

## API Data Extraction

### JSON API Responses

```typescript
import { createLightweightCrawler } from 'crawlx';

// Headers handed to the crawler configuration: request JSON and identify the bot.
const apiRequestHeaders = {
  'Accept': 'application/json',
  'User-Agent': 'DataBot/1.0'
};

const apiCrawler = createLightweightCrawler({ headers: apiRequestHeaders });

// Function rule that decodes the raw response body as JSON.
// Any failure (missing body, malformed JSON) yields null rather than an error.
const jsonRule = {
  data(_$, response) {
    let payload = null;
    try {
      const rawBody = response.body.toString();
      payload = JSON.parse(rawBody);
    } catch {
      // Best effort: leave payload as null.
    }
    return payload;
  }
};

// Clean up in `finally` so the crawler is destroyed even when the crawl rejects.
try {
  const result = await apiCrawler.crawl('https://api.example.com/users', {
    parse: jsonRule
  });

  console.log(result.parsed.data);
} finally {
  await apiCrawler.destroy();
}
```

## Multi-step Workflows

### Login and Scrape Protected Content

```typescript
import { CrawlX } from 'crawlx';

// Read credentials from the environment so real secrets never end up in
// source control; fail fast with a clear message when they are missing.
const { SITE_USERNAME: username, SITE_PASSWORD: password } = process.env;
if (!username || !password) {
  throw new Error('Set SITE_USERNAME and SITE_PASSWORD before running this example.');
}

const crawler = new CrawlX({
  cookies: true, // Enable cookie handling
  headers: {
    'User-Agent': 'Mozilla/5.0 (compatible; Bot/1.0)'
  }
});

// Clean up in `finally` so the crawler is destroyed even when a step rejects.
try {
  // Step 1: Login. The response itself is not needed; this step relies on the
  // cookie handling enabled above to carry the session into the next request.
  await crawler.crawl('https://example.com/login', {
    method: 'POST',
    body: new URLSearchParams({ username, password }),
    headers: {
      'Content-Type': 'application/x-www-form-urlencoded'
    }
  });

  // Step 2: Access protected content with the same crawler instance
  const protectedResult = await crawler.crawl('https://example.com/dashboard', {
    parse: {
      userInfo: '.user-profile',
      notifications: ['.notification'],
      data: '.dashboard-data'
    }
  });

  console.log(protectedResult.parsed);
} finally {
  await crawler.destroy();
}
```

## Performance Monitoring

### Website Health Check

```typescript
import { createValidator } from 'crawlx';

// Options for the health-check validator, named separately for readability.
const validatorOptions = { concurrency: 20, timeout: 10000 };

const validator = createValidator(validatorOptions);

// Health-check rule mixing a selector with function rules. Function rules
// receive `$` as the first argument and the response as the second, so rules
// that only need the response must still declare (and ignore) `$`.
const healthCheckRule = {
  status: (_$, response) => response.statusCode,
  // `??` keeps the fallback limited to a missing timing value.
  loadTime: (_$, response) => response.timing?.total ?? 0,
  title: 'title',
  hasErrors: ($) => $('.error, .warning').length > 0,
  resources: {
    images: ($) => $('img').length,
    scripts: ($) => $('script').length,
    stylesheets: ($) => $('link[rel="stylesheet"]').length
  }
};

const urls = [
  'https://example.com',
  'https://example.com/about',
  'https://example.com/contact',
  'https://example.com/products'
];

// Clean up in `finally` so the validator is destroyed even when a crawl rejects.
try {
  const results = await validator.crawlMany(urls, {
    parse: healthCheckRule
  });

  // One summary line per URL: status code, load time, and an error marker.
  results.forEach(result => {
    const { status, loadTime, hasErrors } = result.parsed;
    console.log(`${result.response.url}: ${status} (${loadTime}ms) ${hasErrors ? '⚠️' : '✅'}`);
  });
} finally {
  await validator.destroy();
}
```

## Custom Filters

```typescript
import { Parser } from 'crawlx';

const parser = new Parser();

// Add custom filters. Each filter coerces its input with String(value ?? '')
// so a missing value (null/undefined) produces an empty result instead of
// throwing on `.replace` / `.toLowerCase`.

// currency: strip everything except digits, '.', and '-', then parse;
// unparseable input becomes 0.
parser.addFilter('currency', (value) => {
  const num = Number.parseFloat(String(value ?? '').replace(/[^0-9.-]/g, ''));
  return Number.isNaN(num) ? 0 : num;
});

// phone: format a 10-digit number (optionally prefixed with country code 1)
// as "(XXX) XXX-XXXX". The pattern is anchored so any other digit count is
// returned as plain digits rather than being partially reformatted.
parser.addFilter('phone', (value) => {
  const digits = String(value ?? '').replace(/\D/g, '');
  return digits.replace(/^1?(\d{3})(\d{3})(\d{4})$/, '($1) $2-$3');
});

// slug: lowercase, collapse runs of non-alphanumerics into '-', and trim a
// leading or trailing '-'.
parser.addFilter('slug', (value) => {
  return String(value ?? '').toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-|-$/g, '');
});

// Use custom filters
const customRule = {
  price: '.price | currency',
  phone: '.contact-phone | phone',
  slug: '.title | slug'
};
```

These examples demonstrate the flexibility and power of CrawlX for various web scraping scenarios. Remember to always respect robots.txt files and website terms of service when scraping.