DE EN ES FR ID JA KO PT RU TH VI ZH

Tài liệu Cheerio v1.2.0

Cài đặt

npm install cheerio

Ví dụ

1. Phân tích HTML cơ bản và lựa chọn phần tử

Vấn đề: Bạn cần trích xuất dữ liệu cụ thể từ một tài liệu HTML, chẳng hạn như lấy tiêu đề sản phẩm từ trang thương mại điện tử.

import * as cheerio from 'cheerio';

// Sample HTML from a product listing page
const html = `
  <div class="products">
    <div class="product" data-id="123">
      <h2 class="title">Wireless Headphones</h2>
      <span class="price">$79.99</span>
      <p class="description">High-quality wireless headphones with noise cancellation.</p>
    </div>
    <div class="product" data-id="456">
      <h2 class="title">Bluetooth Speaker</h2>
      <span class="price">$49.99</span>
      <p class="description">Portable speaker with excellent sound quality.</p>
    </div>
  </div>
`;

// Parse the markup once; `$` queries the resulting document
const $ = cheerio.load(html);

// Collected product records, in document order
const products: Array<{
  id: string;
  title: string;
  price: string;
  description: string;
}> = [];

// Visit every product card and read its fields from the card's own subtree
for (const element of $('.product').toArray()) {
  const card = $(element);

  const id = card.attr('data-id') || ''; // cards without a data-id get ''
  const title = card.find('.title').text().trim();
  const price = card.find('.price').text().trim();
  const description = card.find('.description').text().trim();

  products.push({ id, title, price, description });
}

console.log('Extracted products:', products);

Kết quả mong đợi:

Extracted products: [
  {
    id: '123',
    title: 'Wireless Headphones',
    price: '$79.99',
    description: 'High-quality wireless headphones with noise cancellation.'
  },
  {
    id: '456',
    title: 'Bluetooth Speaker',
    price: '$49.99',
    description: 'Portable speaker with excellent sound quality.'
  }
]

2. Trích xuất và xác thực dữ liệu form

Vấn đề: Trích xuất và xác thực dữ liệu form từ một form HTML, thường cần thiết khi xử lý các form được render phía server.

import * as cheerio from 'cheerio';

const formHtml = `
  <form id="user-form" method="post" action="/submit">
    <div class="form-group">
      <label for="name">Name:</label>
      <input type="text" id="name" name="name" value="John Doe" required>
    </div>
    <div class="form-group">
      <label for="email">Email:</label>
      <input type="email" id="email" name="email" value="john@example.com" required>
    </div>
    <div class="form-group">
      <label for="country">Country:</label>
      <select id="country" name="country">
        <option value="us">United States</option>
        <option value="ca" selected>Canada</option>
        <option value="uk">United Kingdom</option>
      </select>
    </div>
    <div class="form-group">
      <input type="checkbox" id="newsletter" name="newsletter" checked>
      <label for="newsletter">Subscribe to newsletter</label>
    </div>
    <button type="submit">Submit</button>
  </form>
`;

/**
 * Parses `html`, collects the values of its text/email inputs, selects and
 * checkboxes keyed by their `name` attribute, and validates required inputs.
 *
 * Controls without a `name` attribute are ignored.
 *
 * @param html - Markup containing the form controls to read.
 * @returns An object with three members: `data` (field name → value; strings
 *   for inputs and selects, booleans for checkboxes), `errors` (one message
 *   per required text/email input whose trimmed value is empty) and `isValid`
 *   (`true` when `errors` is empty).
 */
function extractFormData(html: string): Record<string, any> {
  const $ = cheerio.load(html);
  const formData: Record<string, string | boolean> = {};
  const errors: string[] = [];

  // Text-like inputs: store the trimmed value and enforce `required`
  $('input[type="text"], input[type="email"]').each((_, element) => {
    const $input = $(element);
    const name = $input.attr('name');
    if (!name) return;

    // An input without a `value` attribute is treated as empty
    const rawValue = $input.val();
    const value = typeof rawValue === 'string' ? rawValue.trim() : '';

    // Attribute presence is what matters here, not its value
    const required = $input.attr('required') !== undefined;
    if (required && value === '') {
      errors.push(`Field '${name}' is required`);
    }

    formData[name] = value;
  });

  // Selects: value of the currently selected option
  $('select').each((_, element) => {
    const $select = $(element);
    const name = $select.attr('name');
    if (!name) return;

    // `:selected` (rather than `[selected]`) is used so that a single-select
    // with no explicitly selected option reports its default first option
    // instead of '' — the same option the control would submit.
    formData[name] = $select.find('option:selected').attr('value') || '';
  });

  // Checkboxes: true when the `checked` attribute is present
  $('input[type="checkbox"]').each((_, element) => {
    const $checkbox = $(element);
    const name = $checkbox.attr('name');
    if (!name) return;

    formData[name] = $checkbox.attr('checked') !== undefined;
  });

  return {
    data: formData,
    errors,
    isValid: errors.length === 0
  };
}

// Usage: run the custom extractor and print its { data, errors, isValid } result
const result = extractFormData(formHtml);
console.log('Form extraction result:', result);

// Alternative: let Cheerio serialize the form itself. serializeArray() yields
// one { name, value } entry per submitted control, ready for submission.
const $ = cheerio.load(formHtml);
const serializedData = $('#user-form').serializeArray();
console.log('Serialized form data:', serializedData);

Kết quả mong đợi:

Form extraction result: {
  data: {
    name: 'John Doe',
    email: 'john@example.com',
    country: 'ca',
    newsletter: true
  },
  errors: [],
  isValid: true
}
Serialized form data: [
  { name: 'name', value: 'John Doe' },
  { name: 'email', value: 'john@example.com' },
  { name: 'country', value: 'ca' },
  { name: 'newsletter', value: 'on' }
]

3. Tạo và chỉnh sửa HTML động

Vấn đề: Chỉnh sửa nội dung HTML hiện có bằng cách thêm, xóa hoặc cập nhật các phần tử - hữu ích cho việc render phía server hoặc tiền xử lý HTML.

import * as cheerio from 'cheerio';

const templateHtml = `
  <html>
    <head>
      <title>Blog Post</title>
    </head>
    <body>
      <header>
        <nav id="navigation"></nav>
      </header>
      <main>
        <article id="content">
          <h1 id="post-title"></h1>
          <div id="post-meta"></div>
          <div id="post-body"></div>
        </article>
        <aside id="sidebar"></aside>
      </main>
      <footer></footer>
    </body>
  </html>
`;

/** Data needed by `generateBlogPage` to fill the blog page template. */
interface BlogPost {
  /** Post title; used for both the `<title>` element and `#post-title`. */
  title: string;
  /** Author name shown in the post metadata as "By <author>". */
  author: string;
  /** Date string handed to `new Date(...)` and displayed via `toLocaleDateString()`. */
  date: string;
  /** Post body as an HTML string; inserted into `#post-body` as markup, not text. */
  content: string;
  /** Tag names; each is rendered as a `#tag` badge. */
  tags: string[];
  /** Links listed in the sidebar; the sidebar is left empty when this array is empty. */
  relatedPosts: Array<{ title: string; url: string }>;
}

/**
 * Escapes the characters that are significant in HTML text and attribute
 * values so that plain-text data can be interpolated into an HTML string.
 */
function escapeHtml(value: string): string {
  return value
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}

/**
 * Fills the blog page `template` with the data from `post` and returns the
 * resulting document as an HTML string.
 *
 * Plain-text fields (author, tags, related-post titles and URLs, navigation
 * labels) are HTML-escaped before being interpolated into markup.
 * `post.content` is inserted as raw HTML and must come from a trusted source.
 *
 * @param template - HTML document containing the placeholder elements
 *   (`title`, `#navigation`, `#post-title`, `#post-meta`, `#post-body`,
 *   `#sidebar`, `footer`).
 * @param post - Content used to populate the placeholders.
 * @returns The populated document, or the unmodified `template` if an error
 *   is thrown while populating it (the error is logged).
 */
function generateBlogPage(template: string, post: BlogPost): string {
  const $ = cheerio.load(template);

  try {
    // Set page title (.text() escapes its argument)
    $('title').text(`${post.title} - My Blog`);

    // Add navigation
    const navItems = [
      { label: 'Home', url: '/' },
      { label: 'About', url: '/about' },
      { label: 'Contact', url: '/contact' }
    ];

    const navHtml = navItems
      .map(item => `<a href="${escapeHtml(item.url)}">${escapeHtml(item.label)}</a>`)
      .join(' | ');
    
    $('#navigation').html(navHtml);

    // Set post content
    $('#post-title').text(post.title);
    
    // Add post metadata
    // NOTE(review): a date-only ISO string such as '2024-01-15' is parsed as
    // UTC, so toLocaleDateString() can show the previous day in time zones
    // behind UTC — confirm whether the date should be formatted in UTC.
    const metaHtml = `
      <div class="post-meta">
        <span class="author">By ${escapeHtml(post.author)}</span>
        <span class="date">${new Date(post.date).toLocaleDateString()}</span>
        <div class="tags">
          ${post.tags.map(tag => `<span class="tag">#${escapeHtml(tag)}</span>`).join(' ')}
        </div>
      </div>
    `;
    $('#post-meta').html(metaHtml);

    // Set post body: intentionally raw HTML — the caller is responsible for
    // sanitizing `post.content` if it can contain untrusted markup
    $('#post-body').html(post.content);

    // Add sidebar with related posts
    if (post.relatedPosts.length > 0) {
      const sidebarHtml = `
        <h3>Related Posts</h3>
        <ul class="related-posts">
          ${post.relatedPosts
            .map(relatedPost => `<li><a href="${escapeHtml(relatedPost.url)}">${escapeHtml(relatedPost.title)}</a></li>`)
            .join('')}
        </ul>
      `;
      $('#sidebar').html(sidebarHtml);
    }

    // Add CSS classes for styling (must run after the tags were inserted above)
    $('article').addClass('blog-post');
    $('.tag').addClass('badge badge-secondary');

    // Add footer
    $('footer').html('<p>&copy; 2024 My Blog. All rights reserved.</p>');

    return $.html();

  } catch (error) {
    console.error('Error generating blog page:', error);
    return template; // Return original template on error
  }
}

// Usage example: sample post used to populate the template
const blogPost: BlogPost = {
  title: 'Getting Started with Cheerio',
  author: 'Jane Smith',
  date: '2024-01-15',
  content: '<p>Cheerio is a powerful server-side HTML manipulation library...</p><p>Here are some key features...</p>',
  tags: ['javascript', 'node', 'html', 'scraping'],
  relatedPosts: [
    { title: 'Web Scraping Best Practices', url: '/posts/web-scraping-best-practices' },
    { title: 'Node.js HTML Processing', url: '/posts/nodejs-html-processing' }
  ]
};

// Render the page, then re-parse the output to read back the <title> as a sanity check
const generatedHtml = generateBlogPage(templateHtml, blogPost);
console.log('Generated HTML length:', generatedHtml.length);
console.log('Page title:', cheerio.load(generatedHtml)('title').text());

Kết quả mong đợi (độ dài HTML có thể thay đổi tùy theo locale, vì ngày được định dạng bằng toLocaleDateString()):

Generated HTML length: 1247
Page title: Getting Started with Cheerio - My Blog

4. Trích xuất dữ liệu bảng với xử lý lỗi

Vấn đề: Trích xuất dữ liệu có cấu trúc từ các bảng HTML đồng thời xử lý dữ liệu bị thiếu, các hàng không đúng định dạng và các cấu trúc bảng khác nhau.

import * as cheerio from 'cheerio';

const tableHtml = `
  <div class="data-container">
    <table id="sales-data" class="table">
      <thead>
        <tr>
          <th>Date</th>
          <th>Product</th>
          <th>Quantity</th>
          <th>Price</th>
          <th>Total</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>2024-01-15</td>
          <td>Laptop</td>
          <td>2</td>
          <td data-currency="USD">$1,200.00</td>
          <td>$2,400.00</td>
        </tr>
        <tr>
          <td>2024-01-16</td>
          <td>Mouse</td>
          <td>5</td>
          <td>$25.99</td>
          <td></td> <!-- Missing total -->
        </tr>
        <tr class="invalid-row">
          <td colspan="5">Invalid data row</td>
        </tr>
        <tr>
          <td>2024-01-17</td>
          <td>Keyboard</td>
          <td>3</td>
          <td>$89.99</td>
          <td>$269.97</td>
        </tr>
      </tbody>
    </table>
  </div>
`;

/** One successfully parsed data row of the sales table. */
interface SalesRecord {
  /** Text of the first cell, kept as a string (not parsed into a Date). */
  date: string;
  /** Text of the second cell. */
  product: string;
  /** Third cell parsed with `parseInt(..., 10)`. */
  quantity: number;
  /** Fourth cell parsed as a float after removing `$` and `,`. */
  price: number;
  /** Fifth cell parsed like `price`, or `quantity * price` when the cell is empty or not numeric. */
  total: number;
  /** Value of the price cell's `data-currency` attribute; the extractor falls back to 'USD'. */
  currency?: string;
}

/** Outcome of `extractTableData`: parsed rows plus diagnostics. */
interface TableExtractionResult {
  /** Rows that were parsed successfully, in table order. */
  data: SalesRecord[];
  /**
   * Human-readable messages. Includes skipped/invalid rows, table-level
   * failures, and informational notes such as a total that was calculated.
   */
  errors: string[];
  /** Row counts over the table's `tbody tr` elements. */
  summary: {
    /** Number of body rows visited. */
    totalRows: number;
    /** Number of rows that produced a record. */
    validRows: number;
    /** `totalRows - validRows`. */
    invalidRows: number;
  };
}

/**
 * Reads the body rows of the table matched by `tableSelector` and converts
 * each well-formed row into a `SalesRecord`.
 *
 * Rows flagged with the `invalid-row` class, rows containing a `colspan`
 * cell, rows whose cell count differs from the header count, and rows with
 * missing/non-numeric required values are reported in `errors` and skipped.
 * A missing or non-numeric total is replaced by `quantity * price`, which is
 * also noted in `errors`.
 *
 * @param html - Markup containing the table.
 * @param tableSelector - Selector locating the table (defaults to `'table'`).
 * @returns Parsed records, collected messages and row counts. Never throws:
 *   failures are reported through `errors`.
 */
function extractTableData(html: string, tableSelector: string = 'table'): TableExtractionResult {
  const $ = cheerio.load(html);
  const records: SalesRecord[] = [];
  const errors: string[] = [];
  let totalRows = 0;
  let validRows = 0;

  // Drops "$" and thousands separators, then parses the remainder as a float
  const parseAmount = (text: string): number => parseFloat(text.replace(/[$,]/g, ''));

  try {
    const $table = $(tableSelector);

    if ($table.length === 0) {
      throw new Error(`Table not found with selector: ${tableSelector}`);
    }

    // Header labels; only their count is needed to validate row width
    const headers: string[] = $table
      .find('thead th')
      .toArray()
      .map(headerCell => $(headerCell).text().trim().toLowerCase());

    const bodyRows = $table.find('tbody tr').toArray();

    for (const [rowIndex, row] of bodyRows.entries()) {
      totalRows += 1;
      const rowNumber = rowIndex + 1; // 1-based, for messages
      const $row = $(row);

      // Placeholder / spanning rows carry no record
      const isPlaceholder = $row.hasClass('invalid-row') || $row.find('td[colspan]').length > 0;
      if (isPlaceholder) {
        errors.push(`Row ${rowNumber}: Skipped invalid row`);
        continue;
      }

      const cells = $row.find('td');

      // Row width must match the header width
      if (cells.length !== headers.length) {
        errors.push(`Row ${rowNumber}: Expected ${headers.length} cells, found ${cells.length}`);
        continue;
      }

      try {
        // Trimmed text of the cell at the given column position
        const cellText = (position: number): string => cells.eq(position).text().trim();

        const dateText = cellText(0);
        const product = cellText(1);
        const quantityText = cellText(2);
        const priceText = cellText(3);
        const totalText = cellText(4);

        const quantity = parseInt(quantityText, 10);
        const price = parseAmount(priceText);

        // Date, product, quantity and price are mandatory
        if (!dateText || !product || isNaN(quantity) || isNaN(price)) {
          errors.push(`Row ${rowNumber}: Missing or invalid required data`);
          continue;
        }

        // Use the stated total when it parses; otherwise derive it
        const statedTotal = totalText ? parseAmount(totalText) : NaN;
        let total: number;
        if (isNaN(statedTotal)) {
          total = quantity * price;
          errors.push(`Row ${rowNumber}: Total calculated from quantity × price`);
        } else {
          total = statedTotal;
        }

        // Currency comes from the price cell, defaulting to USD
        const currency = cells.eq(3).attr('data-currency') || 'USD';

        records.push({ date: dateText, product, quantity, price, total, currency });
        validRows += 1;

      } catch (error) {
        errors.push(`Row ${rowNumber}: ${error instanceof Error ? error.message : 'Unknown parsing error'}`);
      }
    }

  } catch (error) {
    errors.push(`Table extraction failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
  }

  const summary = {
    totalRows,
    validRows,
    invalidRows: totalRows - validRows
  };

  return { data: records, errors, summary };
}

// Usage: extract only the table with id "sales-data"
const extractionResult = extractTableData(tableHtml, '#sales-data');

console.log('Extraction Summary:', extractionResult.summary);
console.log('Errors:', extractionResult.errors);
console.log('Valid Records:', extractionResult.data);

// Sum the per-row totals of all valid records
const grandTotal = extractionResult.data
  .map(({ total }) => total)
  .reduce((runningSum, rowTotal) => runningSum + rowTotal, 0);
console.log(`Grand Total: $${grandTotal.toFixed(2)}`);

Kết quả mong đợi:

Extraction Summary: { totalRows: 4, validRows: 3, invalidRows: 1 }
Errors: [
  'Row 2: Total calculated from quantity × price',
  'Row 3: Skipped invalid row'
]
Valid Records: [
  {
    date: '2024-01-15',
    product: 'Laptop',
    quantity: 2,
    price: 1200,
    total: 2400,
    currency: 'USD'
  },
  {
    date: '2024-01-16',
    product: 'Mouse',
    quantity: 5,
    price: 25.99,
    total: 129.95,
    currency: 'USD'
  },
  {
    date: '2024-01-17',
    product: 'Keyboard',
    quantity: 3,
    price: 89.99,
    total: 269.97,
    currency: 'USD'
  }
]
Grand Total: $2799.92

5. Trích xuất dữ liệu nâng cao với CSS và thuộc tính

Vấn đề: Trích xuất các cấu trúc dữ liệu phức tạp bao gồm thuộc tính CSS, thuộc tính data và các phần tử lồng nhau để phân tích nội dung toàn diện.

import * as cheerio from 'cheerio';

const complexHtml = `
  <div class="dashboard">
    <div class="widget" data-widget-id="chart-1" data-refresh="30" style="width: 300px; height: 200px; background-color: #f0f0f0;">
      <h3 class="widget-title" style="color: #333;">Sales Chart</h3>
      <div class="widget-content">
        <canvas id="sales-canvas" width="280" height="150"></canvas>
        <div class="widget-meta">
          <span class="last-updated" data-timestamp="1642291200">Last updated: 2 hours ago</span>
          <a href="/reports/sales" class="view-details" target="_blank">View Details</a>
        </div>
      </div>
    </div>
    
    <div class="widget" data-widget-id="stats-1" style="width: 250px; height: 150px;">
      <h3 class="widget-title">Key Metrics</h3>
      <ul class="stats-list">
        <li data-metric="revenue">
          <span class="metric-label">Revenue:</span>
          <span class="metric-value" data-value="125000">$125,000</span>
        </li>
        <li data-metric="orders">
          <span class="metric-label">Orders:</span>
          <span class="metric-value" data-value="1340">1,340</span>
        </li>
        <li data-metric="conversion" class="metric-highlight">
          <span class="metric-label">Conversion Rate:</span>
          <span class="metric-value" data-value="3.2">3.2%</span>
        </li>
      </ul>
    </div>
  </div>
`;

/** Structured description of one `.widget` element produced by `extractWidgetData`. */
interface WidgetData {
  /** Value of the widget's `data-widget-id` attribute ('' when absent). */
  id: string;
  /** Trimmed text of the widget's `.widget-title` element. */
  title: string;
  /** 'chart' when the widget contains a canvas, 'stats' when it contains a `.stats-list`, otherwise 'unknown'. */
  type: string;
  /** Integer width/height parsed from the inline style (0 when missing or unparsable). */
  dimensions: { width: number; height: number };
  /** `data-refresh` parsed as a base-10 integer; undefined when the attribute is absent. Presumably seconds — confirm. */
  refreshInterval?: number;
  /** Inline style declarations as property → value strings. */
  styles: Record<string, string>;
  /** Type-specific payload: canvas details for charts, a `metrics` array for stats, `{}` otherwise. */
  content: any;
  /** Anchors inside the widget that have both text and an href. */
  links: Array<{ text: string; url: string; target?: string }>;
}

/**
 * Builds a `WidgetData` record for every `.widget` element in `html`.
 *
 * Each record combines the widget's data attributes, its parsed inline
 * styles, a type-specific `content` payload (chart or stats) and the links it
 * contains. A widget that throws while being processed is logged and skipped.
 *
 * @param html - Dashboard markup to inspect.
 * @returns One record per successfully processed widget, in document order.
 */
function extractWidgetData(html: string): WidgetData[] {
  const $ = cheerio.load(html);
  const widgets: WidgetData[] = [];

  for (const element of $('.widget').toArray()) {
    const $widget = $(element);

    try {
      // Inline styles drive the reported dimensions
      const styles = parseInlineStyles($widget.attr('style') || '');
      const refreshAttr = $widget.attr('data-refresh');

      // Widget kind is inferred from its contents: canvas → chart, stats list → stats
      let type = 'unknown';
      let content: any = {};

      const $canvas = $widget.find('canvas');
      if ($canvas.length > 0) {
        type = 'chart';
        content = {
          canvasId: $canvas.attr('id'),
          canvasDimensions: {
            width: $canvas.attr('width'),
            height: $canvas.attr('height')
          },
          lastUpdated: $widget.find('[data-timestamp]').attr('data-timestamp')
        };
      } else if ($widget.find('.stats-list').length > 0) {
        type = 'stats';
        content = {
          metrics: $widget
            .find('.stats-list li')
            .toArray()
            .map(metricElement => {
              const $metric = $(metricElement);
              const $value = $metric.find('.metric-value');

              return {
                type: $metric.attr('data-metric') || '',
                // Drop the first colon of the label ("Revenue:" → "Revenue")
                label: $metric.find('.metric-label').text().replace(':', '').trim(),
                displayValue: $value.text().trim(),
                rawValue: parseFloat($value.attr('data-value') || '') || 0,
                highlighted: $metric.hasClass('metric-highlight')
              };
            })
        };
      }

      // Keep only anchors that have both visible text and an href
      const links: Array<{ text: string; url: string; target?: string }> = [];
      for (const linkElement of $widget.find('a').toArray()) {
        const $link = $(linkElement);
        const text = $link.text().trim();
        const url = $link.attr('href') || '';
        if (!text || !url) continue;

        const link: { text: string; url: string; target?: string } = { text, url };
        const target = $link.attr('target');
        if (target) {
          link.target = target; // only present when the anchor declares one
        }
        links.push(link);
      }

      widgets.push({
        id: $widget.attr('data-widget-id') || '',
        title: $widget.find('.widget-title').text().trim(),
        type,
        dimensions: {
          width: parseInt(styles.width) || 0,
          height: parseInt(styles.height) || 0
        },
        refreshInterval: refreshAttr ? parseInt(refreshAttr, 10) : undefined,
        styles,
        content,
        links
      });

    } catch (error) {
      console.error(`Error processing widget: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
  }

  return widgets;
}

/**
 * Parses the contents of an inline `style` attribute into a
 * property → value map.
 *
 * Each `;`-separated declaration is split at its FIRST colon only, so values
 * that themselves contain colons (for example `url(http://…)`) are kept
 * intact. Declarations with no colon, an empty property or an empty value are
 * ignored; a property that appears more than once keeps its last value.
 *
 * Limitation: declarations are still separated on every `;`, so values that
 * contain a semicolon (such as `data:` URIs) are not supported.
 *
 * @param styleString - Raw style attribute text, e.g. `"width: 300px; color: red"`.
 * @returns Map of trimmed property names to trimmed values (empty for `''`).
 */
function parseInlineStyles(styleString: string): Record<string, string> {
  const styles: Record<string, string> = {};

  if (!styleString) return styles;

  for (const declaration of styleString.split(';')) {
    const colonAt = declaration.indexOf(':');
    if (colonAt === -1) continue; // not a "property: value" pair

    const property = declaration.slice(0, colonAt).trim();
    const value = declaration.slice(colonAt + 1).trim();

    if (property && value) {
      styles[property] = value;
    }
  }

  return styles;
}

// Usage: extract every widget and dump the result as formatted JSON
const widgetData = extractWidgetData(complexHtml);

console.log('Extracted Widgets:', JSON.stringify(widgetData, null, 2));

// Example: widgets that declare a refresh interval below 60
const frequentlyUpdatedWidgets = widgetData.filter(
  ({ refreshInterval }) => refreshInterval && refreshInterval < 60
);

console.log(`Widgets with refresh < 60s: ${frequentlyUpdatedWidgets.length}`);

// Example: sum of width × height over all widgets
const totalArea = widgetData
  .map(({ dimensions }) => dimensions.width * dimensions.height)
  .reduce((areaSoFar, widgetArea) => areaSoFar + widgetArea, 0);

console.log(`Total dashboard area: ${totalArea} square pixels`);

Kết quả mong đợi:

Extracted Widgets: [
  {
    "id": "chart-1",
    "title": "Sales Chart",
    "type": "chart",
    "dimensions": { "width": 300, "height": 200 },
    "refreshInterval": 30,
    "styles": {
      "width": "300px",
      "height": "200px",
      "background-color": "#f0f0f0"
    },
    "content": {
      "canvasId": "sales-canvas",
      "canvasDimensions": { "width": "280", "height": "150" },
      "lastUpdated": "1642291200"
    },
    "links": [
      { "text": "View Details", "url": "/reports/sales", "target": "_blank" }
    ]
  },
  {
    "id": "stats-1",
    "title": "Key Metrics",
    "type": "stats",
    "dimensions": { "width": 250, "height": 150 },
    "styles": {
      "width": "250px",
      "height": "150px"
    },
    "content": {
      "metrics": [
        {
          "type": "revenue",
          "label": "Revenue",
          "displayValue": "$125,000",
          "rawValue": 125000,
          "highlighted": false
        },
        {
          "type": "orders",
          "label": "Orders",
          "displayValue": "1,340",
          "rawValue": 1340,
          "highlighted": false
        },
        {
          "type": "conversion",
          "label": "Conversion Rate",
          "displayValue": "3.2%",
          "rawValue": 3.2,
          "highlighted": true
        }
      ]
    },
    "links": []
  }
]
Widgets with refresh < 60s: 1
Total dashboard area: 97500 square pixels

6. Tải URL và xử lý nội dung từ xa

Vấn đề: Lấy và xử lý nội dung HTML từ các URL từ xa, hỗ trợ các loại nội dung khác nhau và xử lý lỗi sao cho chương trình không bị gián đoạn.

import * as cheerio from 'cheerio';

/** Information collected from one page by `scrapeWebPage`. */
interface ScrapingResult {
  /** The URL that was requested. */
  url: string;
  /** Trimmed text of the page's `<title>` element. */
  title?: string;
  /** Meta description, Open Graph description, or an excerpt of the first paragraph. */
  description?: string;
  /** Anchors that have both an href and visible text; `external` marks links to another host. */
  links: Array<{ text: string; href: string; external: boolean }>;
  /** Images with an absolute `src`; `dimensions` is "<width>x<height>" when both attributes exist. */
  images: Array<{ src: string; alt: string; dimensions?: string }>;
  /** Meta tag contents keyed by `name`/`property`, plus JSON-LD under 'structured-data'. */
  metadata: Record<string, string>;
  /** Set when fetching or processing failed; the other fields may then be empty or partial. */
  error?: string;
}

/**
 * Fetches `url` with Cheerio and extracts its title, description, links,
 * images and meta information.
 *
 * The function never rejects: any failure while fetching or processing is
 * reported through `result.error`, and whatever was collected up to that
 * point is returned.
 *
 * @param url - Absolute URL of the page to fetch.
 * @returns The collected page information.
 */
async function scrapeWebPage(url: string): Promise<ScrapingResult> {
  const result: ScrapingResult = {
    url,
    links: [],
    images: [],
    metadata: {}
  };

  // Resolves `raw` against the page URL; undefined when it cannot be parsed,
  // so one malformed attribute does not abort the whole scrape
  const resolveAgainstPage = (raw: string): URL | undefined => {
    try {
      return new URL(raw, url);
    } catch {
      return undefined;
    }
  };

  try {
    // Load content from URL
    const $ = await cheerio.fromURL(url, {
      // Set custom headers
      requestOptions: {
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; CheerioBot/1.0)'
        }
      },
      // Handle encoding automatically
      encoding: {
        defaultEncoding: 'utf-8'
      }
    });

    // Parsed once here instead of once per link
    const pageHost = new URL(url).hostname;

    // Extract basic page information
    result.title = $('title').text().trim();

    // Try multiple selectors for description. The paragraph excerpt is only
    // used when a paragraph with text exists, so a page without one does not
    // end up with a bare '...' description.
    const firstParagraph = $('p').first().text();
    result.description =
      $('meta[name="description"]').attr('content') ||
      $('meta[property="og:description"]').attr('content') ||
      (firstParagraph ? firstParagraph.substring(0, 200) + '...' : undefined);

    // Extract all links
    $('a[href]').each((_, element) => {
      const $link = $(element);
      const href = $link.attr('href') || '';
      const text = $link.text().trim();

      if (href && text) {
        // Compare parsed hostnames rather than searching the raw href for the
        // host as a substring. A substring test treats
        // "https://evil.com/?ref=<host>" as internal and never flags
        // protocol-relative links ("//other.com"). Subdomains of the page
        // host still count as internal, and only http(s) targets can be
        // external (mailto:, javascript:, … are not).
        const target = resolveAgainstPage(href);
        const isExternal =
          target !== undefined &&
          (target.protocol === 'http:' || target.protocol === 'https:') &&
          target.hostname !== pageHost &&
          !target.hostname.endsWith(`.${pageHost}`);

        result.links.push({
          text,
          href,
          external: isExternal
        });
      }
    });

    // Extract images with metadata
    $('img[src]').each((_, element) => {
      const $img = $(element);
      const src = $img.attr('src') || '';
      const alt = $img.attr('alt') || '';
      const width = $img.attr('width');
      const height = $img.attr('height');

      const dimensions = width && height ? `${width}x${height}` : undefined;

      // Absolute http(s) sources are kept verbatim; anything else is resolved
      // against the page URL, falling back to the raw value if unparsable
      const absoluteSrc = /^https?:/i.test(src)
        ? src
        : (resolveAgainstPage(src)?.href ?? src);

      result.images.push({
        src: absoluteSrc,
        alt,
        dimensions
      });
    });

    // Extract metadata from meta tags
    $('meta').each((_, element) => {
      const $meta = $(element);
      const name = $meta.attr('name') || $meta.attr('property') || '';
      const content = $meta.attr('content') || '';

      if (name && content) {
        result.metadata[name] = content;
      }
    });

    // Extract structured data (JSON-LD)
    // NOTE(review): every block writes the same key, so only the last valid
    // JSON-LD script on the page is kept — confirm this is intended.
    $('script[type="application/ld+json"]').each((_, element) => {
      try {
        const jsonData = JSON.parse($(element).text());
        result.metadata['structured-data'] = JSON.stringify(jsonData);
      } catch {
        // Ignore malformed JSON-LD
      }
    });

  } catch (error) {
    result.error = error instanceof Error ? error.message : 'Unknown error occurred';
  }

  return result;
}

// Usage example with error handling
/**
 * Scrapes the given URLs one after another and returns one result per URL,
 * in input order.
 *
 * After each successfully awaited scrape the function waits one second before
 * moving on. If scraping a URL rejects, the failure is logged and recorded as
 * a result whose `error` field is set.
 *
 * @param urls - Pages to scrape, processed sequentially.
 * @returns One `ScrapingResult` per input URL.
 */
async function scrapeMultiplePages(urls: string[]): Promise<ScrapingResult[]> {
  const collected: ScrapingResult[] = [];

  // Resolves after `ms` milliseconds
  const pause = (ms: number) => new Promise(resolve => setTimeout(resolve, ms));

  for (const url of urls) {
    console.log(`Scraping: ${url}`);

    let outcome: ScrapingResult;
    try {
      outcome = await scrapeWebPage(url);
    } catch (error) {
      console.error(`Failed to scrape ${url}:`, error);
      collected.push({
        url,
        links: [],
        images: [],
        metadata: {},
        error: error instanceof Error ? error.message : 'Failed to fetch'
      });
      continue;
    }

    collected.push(outcome);

    // Add delay to be respectful to servers
    await pause(1000);
  }

  return collected;
}

// Example usage (commented out to avoid actual HTTP requests in documentation)
/*
async function main() {
  const urlsToScrape = [
    'https://example.com',
    'https://httpbin.org/html',
  ];

  const results = await scrapeMultiplePages(urlsToScrape);
  
  results.forEach(result => {
    console.log(`\n--- Results for ${result.url} ---`);
    
    if (result.error) {
      console.log('Error:', result.error);
      return;
    }
    
    console.log('Title:', result.title);
    console.log('Description:', result.description?.substring(0, 100) + '...');
    console.log('Links found:', result.links.length);
    console.log('Images found:', result.images.length);
    console.log('External links:', result.links.filter(link => link.external).length);
    
    // Show some metadata
    Object.entries(result.metadata).slice(0, 5).forEach(([key, value]) => {
      console.log(`Meta ${key}:`, value.substring(0, 50) + '...');
    });
  });
}

main().catch(console.error);
*/

// For demonstration, here's what the output structure would look like.
// This is a hand-written sample value, not the result of an actual request.
const exampleResult: ScrapingResult = {
  url: 'https://example.com',
  title: 'Example Domain',
  description: 'This domain is for use in illustrative examples...',
  links: [
    { text: 'More information...', href: 'https://iana.org/domains/example', external: true }
  ],
  images: [
    { src: 'https://example.com/logo.png', alt: 'Example Logo', dimensions: '200x100' }
  ],
  metadata: {
    'viewport': 'width=device-width, initial-scale=1',
    'og:title': 'Example Domain',
    'og:description': 'This domain is for use in illustrative examples'
  }
};

// Pretty-print the sample with 2-space indentation
console.log('Example scraping result:', JSON.stringify(exampleResult, null, 2));

Kết quả mong đợi:

Example scraping result: {
  "url": "https://example.com",
  "title": "Example Domain",
  "description": "This domain is for use in illustrative examples...",
  "links