Documentation Cheerio v1.2.0
Installation
npm install cheerio
Exemples
1. Analyse HTML de base et sélection d'éléments
Problème : Vous devez extraire des données spécifiques d'un document HTML, comme récupérer les titres de produits d'une page e-commerce.
import * as cheerio from 'cheerio';
// Sample HTML from a product listing page: two `.product` cards, each carrying
// a `data-id` attribute and `.title`, `.price` and `.description` children.
const html = `
<div class="products">
<div class="product" data-id="123">
<h2 class="title">Wireless Headphones</h2>
<span class="price">$79.99</span>
<p class="description">High-quality wireless headphones with noise cancellation.</p>
</div>
<div class="product" data-id="456">
<h2 class="title">Bluetooth Speaker</h2>
<span class="price">$49.99</span>
<p class="description">Portable speaker with excellent sound quality.</p>
</div>
</div>
`;
// Parse the markup into a queryable Cheerio document
const $ = cheerio.load(html);

// Convert every `.product` card into a plain record, in document order
const products: Array<{
  id: string;
  title: string;
  price: string;
  description: string;
}> = $('.product')
  .toArray()
  .map((node) => {
    const card = $(node);
    // Trimmed text of the first-level lookup inside this card
    const textOf = (selector: string): string => card.find(selector).text().trim();

    return {
      id: card.attr('data-id') || '',
      title: textOf('.title'),
      price: textOf('.price'),
      description: textOf('.description')
    };
  });

console.log('Extracted products:', products);
Résultat attendu :
Extracted products: [
{
id: '123',
title: 'Wireless Headphones',
price: '$79.99',
description: 'High-quality wireless headphones with noise cancellation.'
},
{
id: '456',
title: 'Bluetooth Speaker',
price: '$49.99',
description: 'Portable speaker with excellent sound quality.'
}
]
2. Extraction et validation de données de formulaire
Problème : Extraire et valider les données d'un formulaire HTML, une opération fréquente lors du traitement de formulaires rendus côté serveur.
import * as cheerio from 'cheerio';
// Sample form: required `name` (text) and `email` inputs, a `country` select
// with the "ca" option marked selected, and a checked `newsletter` checkbox.
const formHtml = `
<form id="user-form" method="post" action="/submit">
<div class="form-group">
<label for="name">Name:</label>
<input type="text" id="name" name="name" value="John Doe" required>
</div>
<div class="form-group">
<label for="email">Email:</label>
<input type="email" id="email" name="email" value="john@example.com" required>
</div>
<div class="form-group">
<label for="country">Country:</label>
<select id="country" name="country">
<option value="us">United States</option>
<option value="ca" selected>Canada</option>
<option value="uk">United Kingdom</option>
</select>
</div>
<div class="form-group">
<input type="checkbox" id="newsletter" name="newsletter" checked>
<label for="newsletter">Subscribe to newsletter</label>
</div>
<button type="submit">Submit</button>
</form>
`;
/**
 * Reads the values of text/email inputs, selects and checkboxes from an HTML
 * string and validates that required text/email inputs are not blank.
 *
 * @param html - Markup containing the form controls to read.
 * @returns An object with three keys: `data` (control name → value; strings
 *   for inputs and selects, booleans for checkboxes), `errors` (one message
 *   per blank required input) and `isValid` (`true` when `errors` is empty).
 */
function extractFormData(html: string): Record<string, any> {
  const $ = cheerio.load(html);
  const formData: Record<string, string | boolean> = {};
  const errors: string[] = [];

  // Text-like inputs: store the trimmed value, flag required-but-blank fields
  $('input[type="text"], input[type="email"]').each((_, element) => {
    const $input = $(element);
    const name = $input.attr('name');
    if (!name) return;

    // `.val()` may also yield undefined or an array; narrow instead of asserting
    const raw = $input.val();
    const value = typeof raw === 'string' ? raw.trim() : '';

    if ($input.attr('required') !== undefined && value === '') {
      errors.push(`Field '${name}' is required`);
    }
    formData[name] = value;
  });

  // Selects: value of the option marked `selected`
  $('select').each((_, element) => {
    const $select = $(element);
    const name = $select.attr('name');
    if (!name) return;

    const $options = $select.find('option');
    const $marked = $options.filter('[selected]');
    const isMultiple = $select.attr('multiple') !== undefined;

    // A single-choice select with no option marked `selected` still submits
    // its first option, so fall back to it instead of reporting ''.
    const $chosen = $marked.length > 0 || isMultiple ? $marked : $options.first();
    formData[name] = $chosen.attr('value') || '';
  });

  // Checkboxes: true when the `checked` attribute is present
  $('input[type="checkbox"]').each((_, element) => {
    const $checkbox = $(element);
    const name = $checkbox.attr('name');
    if (name) {
      formData[name] = $checkbox.attr('checked') !== undefined;
    }
  });

  return {
    data: formData,
    errors,
    isValid: errors.length === 0
  };
}
// Usage: run the extractor against the sample form
const result = extractFormData(formHtml);
console.log('Form extraction result:', result);

// Cheerio can also turn the form itself into an array of name/value pairs
const $ = cheerio.load(formHtml);
const $userForm = $('#user-form');
const serializedData = $userForm.serializeArray();
console.log('Serialized form data:', serializedData);
Résultat attendu :
Form extraction result: {
data: {
name: 'John Doe',
email: 'john@example.com',
country: 'ca',
newsletter: true
},
errors: [],
isValid: true
}
Serialized form data: [
{ name: 'name', value: 'John Doe' },
{ name: 'email', value: 'john@example.com' },
{ name: 'country', value: 'ca' },
{ name: 'newsletter', value: 'on' }
]
3. Génération et modification dynamiques de HTML
Problème : Modifier un contenu HTML existant en ajoutant, supprimant ou mettant à jour des éléments — utile pour le rendu côté serveur ou le prétraitement de HTML.
import * as cheerio from 'cheerio';
// Page skeleton with empty placeholders (#navigation, #post-title, #post-meta,
// #post-body, #sidebar and <footer>) that generateBlogPage fills in.
const templateHtml = `
<html>
<head>
<title>Blog Post</title>
</head>
<body>
<header>
<nav id="navigation"></nav>
</header>
<main>
<article id="content">
<h1 id="post-title"></h1>
<div id="post-meta"></div>
<div id="post-body"></div>
</article>
<aside id="sidebar"></aside>
</main>
<footer></footer>
</body>
</html>
`;
/** Data needed by generateBlogPage to render a single blog post. */
interface BlogPost {
// Used for the document <title> and for the #post-title heading
title: string;
// Shown in the post metadata as "By <author>"
author: string;
// Date string handed to `new Date(...)` before being formatted for display
date: string;
// Inserted as HTML (not as text) into #post-body
content: string;
// Each tag is rendered as a "#tag" span in the post metadata
tags: string[];
// Rendered as a list of links in the sidebar; the sidebar is left empty when this is empty
relatedPosts: Array<{ title: string; url: string }>;
}
/**
 * Fills the blog page template with the data of a single post.
 *
 * Plain-text fields (author, tags, related-post titles and URLs) are
 * HTML-escaped before being interpolated into markup, so special characters in
 * them cannot inject elements or break out of attributes. `post.content` is
 * inserted verbatim because it is meant to be HTML.
 *
 * @param template - HTML skeleton containing `title`, `#navigation`,
 *   `#post-title`, `#post-meta`, `#post-body`, `#sidebar` and `footer`.
 * @param post - Post data to render.
 * @returns The rendered document, or the unchanged `template` if rendering
 *   throws (the error is logged).
 */
function generateBlogPage(template: string, post: BlogPost): string {
  const $ = cheerio.load(template);

  // Escapes text for use in element content or a double-quoted attribute value
  const escapeHtml = (text: string): string =>
    text
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');

  // Date-only ISO strings ("2024-01-15") are parsed as UTC midnight; format
  // them in UTC as well, otherwise the displayed day shifts back by one in
  // timezones west of UTC. Unparsable input is shown as-is.
  const formatDate = (value: string): string => {
    const parsed = new Date(value);
    if (Number.isNaN(parsed.getTime())) return value;
    const isDateOnly = /^\d{4}-\d{2}-\d{2}$/.test(value.trim());
    return isDateOnly
      ? parsed.toLocaleDateString(undefined, { timeZone: 'UTC' })
      : parsed.toLocaleDateString();
  };

  try {
    // Set page title (`.text()` escapes on its own)
    $('title').text(`${post.title} - My Blog`);

    // Add navigation (static, trusted labels and URLs)
    const navItems = [
      { label: 'Home', url: '/' },
      { label: 'About', url: '/about' },
      { label: 'Contact', url: '/contact' }
    ];
    const navHtml = navItems
      .map(item => `<a href="${item.url}">${item.label}</a>`)
      .join(' | ');
    $('#navigation').html(navHtml);

    // Set post heading
    $('#post-title').text(post.title);

    // Add post metadata; lines are joined with '\n' to keep the original layout
    const tagsHtml = post.tags
      .map(tag => `<span class="tag">#${escapeHtml(tag)}</span>`)
      .join(' ');
    const metaHtml = [
      '',
      '<div class="post-meta">',
      `<span class="author">By ${escapeHtml(post.author)}</span>`,
      `<span class="date">${escapeHtml(formatDate(post.date))}</span>`,
      '<div class="tags">',
      tagsHtml,
      '</div>',
      '</div>',
      ''
    ].join('\n');
    $('#post-meta').html(metaHtml);

    // Set post body (intentionally raw HTML)
    $('#post-body').html(post.content);

    // Add sidebar with related posts
    if (post.relatedPosts.length > 0) {
      const relatedItems = post.relatedPosts
        .map(
          relatedPost =>
            `<li><a href="${escapeHtml(relatedPost.url)}">${escapeHtml(relatedPost.title)}</a></li>`
        )
        .join('');
      const sidebarHtml = [
        '',
        '<h3>Related Posts</h3>',
        '<ul class="related-posts">',
        relatedItems,
        '</ul>',
        ''
      ].join('\n');
      $('#sidebar').html(sidebarHtml);
    }

    // Add CSS classes for styling
    $('article').addClass('blog-post');
    $('.tag').addClass('badge badge-secondary');

    // Add footer
    $('footer').html('<p>© 2024 My Blog. All rights reserved.</p>');

    return $.html();
  } catch (error) {
    console.error('Error generating blog page:', error);
    return template; // Return original template on error
  }
}
// Usage example: render one post into the template and inspect the result
const blogPost: BlogPost = {
  title: 'Getting Started with Cheerio',
  author: 'Jane Smith',
  date: '2024-01-15',
  content: '<p>Cheerio is a powerful server-side HTML manipulation library...</p><p>Here are some key features...</p>',
  tags: ['javascript', 'node', 'html', 'scraping'],
  relatedPosts: [
    { title: 'Web Scraping Best Practices', url: '/posts/web-scraping-best-practices' },
    { title: 'Node.js HTML Processing', url: '/posts/nodejs-html-processing' }
  ]
};

const generatedHtml = generateBlogPage(templateHtml, blogPost);
console.log('Generated HTML length:', generatedHtml.length);

// Re-parse the output to read the <title> back
const $generated = cheerio.load(generatedHtml);
console.log('Page title:', $generated('title').text());
Résultat attendu :
Generated HTML length: 1247
Page title: Getting Started with Cheerio - My Blog
4. Extraction de données de tableau avec gestion d'erreurs
Problème : Extraire des données structurées de tableaux HTML tout en gérant les données manquantes, les lignes malformées et différentes structures de tableaux.
import * as cheerio from 'cheerio';
// Sample sales table: a five-column header and four body rows — two complete
// rows, one row with an empty Total cell, and one colspan row marked with the
// `invalid-row` class.
const tableHtml = `
<div class="data-container">
<table id="sales-data" class="table">
<thead>
<tr>
<th>Date</th>
<th>Product</th>
<th>Quantity</th>
<th>Price</th>
<th>Total</th>
</tr>
</thead>
<tbody>
<tr>
<td>2024-01-15</td>
<td>Laptop</td>
<td>2</td>
<td data-currency="USD">$1,200.00</td>
<td>$2,400.00</td>
</tr>
<tr>
<td>2024-01-16</td>
<td>Mouse</td>
<td>5</td>
<td>$25.99</td>
<td></td> <!-- Missing total -->
</tr>
<tr class="invalid-row">
<td colspan="5">Invalid data row</td>
</tr>
<tr>
<td>2024-01-17</td>
<td>Keyboard</td>
<td>3</td>
<td>$89.99</td>
<td>$269.97</td>
</tr>
</tbody>
</table>
</div>
`;
/** One successfully parsed body row of the sales table. */
interface SalesRecord {
// Trimmed text of the first cell, kept as a string (not parsed into a Date)
date: string;
// Trimmed text of the second cell
product: string;
// Third cell parsed with parseInt (base 10)
quantity: number;
// Fourth cell parsed as a float after removing "$" and "," characters
price: number;
// Fifth cell parsed like `price`; computed as quantity * price when the cell is empty or unparsable
total: number;
// `data-currency` attribute of the price cell; extractTableData falls back to 'USD'
currency?: string;
}
/** Outcome of extractTableData: parsed rows plus diagnostics. */
interface TableExtractionResult {
// Rows that passed validation, in table order
data: SalesRecord[];
// Per-row messages; includes skipped/rejected rows and also the informational
// note emitted when a total had to be calculated
errors: string[];
summary: {
// Number of `tbody tr` rows visited
totalRows: number;
// Rows that produced a SalesRecord
validRows: number;
// totalRows - validRows
invalidRows: number;
};
}
/**
 * Parses the body rows of a sales table into typed records.
 *
 * Rows flagged as invalid (class `invalid-row` or a `colspan` cell), rows whose
 * cell count differs from the header count, and rows with missing/unparsable
 * required values are reported in `errors` and excluded from `data`. A missing
 * or unparsable total is replaced by `quantity * price` and noted in `errors`.
 *
 * @param html - Markup containing the table.
 * @param tableSelector - CSS selector locating the table (default `'table'`).
 * @returns Parsed records, collected messages and row counts. Failures are
 *   reported through `errors`; the function does not throw for a missing table.
 */
function extractTableData(html: string, tableSelector: string = 'table'): TableExtractionResult {
  const $ = cheerio.load(html);
  const results: SalesRecord[] = [];
  const errors: string[] = [];
  let totalRows = 0;
  let validRows = 0;

  // Money cells look like "$1,200.00": drop "$" and "," before parsing
  const parseMoney = (text: string): number => parseFloat(text.replace(/[$,]/g, ''));

  try {
    const $table = $(tableSelector);
    if ($table.length === 0) {
      throw new Error(`Table not found with selector: ${tableSelector}`);
    }

    // Header labels are only used to know how many cells a row must have
    const headers: string[] = $table
      .find('thead th')
      .toArray()
      .map(th => $(th).text().trim().toLowerCase());

    $table.find('tbody tr').each((rowIndex, row) => {
      totalRows += 1;
      const $row = $(row);

      // Placeholder rows (explicitly flagged or spanning columns) carry no data
      const isPlaceholder = $row.hasClass('invalid-row') || $row.find('td[colspan]').length > 0;
      if (isPlaceholder) {
        errors.push(`Row ${rowIndex + 1}: Skipped invalid row`);
        return;
      }

      const cells = $row.find('td');
      if (cells.length !== headers.length) {
        errors.push(`Row ${rowIndex + 1}: Expected ${headers.length} cells, found ${cells.length}`);
        return;
      }

      try {
        const cellText = (position: number): string => $(cells[position]).text().trim();
        const dateText = cellText(0);
        const product = cellText(1);
        const quantityText = cellText(2);
        const priceText = cellText(3);
        const totalText = cellText(4);

        const quantity = parseInt(quantityText, 10);
        const price = parseMoney(priceText);
        let total = totalText ? parseMoney(totalText) : 0;

        // Date, product, quantity and price are mandatory
        const hasRequired = dateText && product && !isNaN(quantity) && !isNaN(price);
        if (!hasRequired) {
          errors.push(`Row ${rowIndex + 1}: Missing or invalid required data`);
          return;
        }

        // Derive the total when the cell is blank or not a number
        if (!totalText || isNaN(total)) {
          total = quantity * price;
          errors.push(`Row ${rowIndex + 1}: Total calculated from quantity × price`);
        }

        const currency = $(cells[3]).attr('data-currency') || 'USD';

        results.push({ date: dateText, product, quantity, price, total, currency });
        validRows += 1;
      } catch (error) {
        errors.push(`Row ${rowIndex + 1}: ${error instanceof Error ? error.message : 'Unknown parsing error'}`);
      }
    });
  } catch (error) {
    errors.push(`Table extraction failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
  }

  const summary = { totalRows, validRows, invalidRows: totalRows - validRows };
  return { data: results, errors, summary };
}
// Usage: extract the sample table and print each part of the result
const extractionResult = extractTableData(tableHtml, '#sales-data');
console.log('Extraction Summary:', extractionResult.summary);
console.log('Errors:', extractionResult.errors);
console.log('Valid Records:', extractionResult.data);

// Sum the totals of all valid records
let grandTotal = 0;
for (const record of extractionResult.data) {
  grandTotal = grandTotal + record.total;
}
console.log(`Grand Total: $${grandTotal.toFixed(2)}`);
Résultat attendu :
Extraction Summary: { totalRows: 4, validRows: 3, invalidRows: 1 }
Errors: [
'Row 2: Total calculated from quantity × price',
'Row 3: Skipped invalid row'
]
Valid Records: [
{
date: '2024-01-15',
product: 'Laptop',
quantity: 2,
price: 1200,
total: 2400,
currency: 'USD'
},
{
date: '2024-01-16',
product: 'Mouse',
quantity: 5,
price: 25.99,
total: 129.95,
currency: 'USD'
},
{
date: '2024-01-17',
product: 'Keyboard',
quantity: 3,
price: 89.99,
total: 269.97,
currency: 'USD'
}
]
Grand Total: $2799.92
5. Extraction avancée de données avec CSS et attributs
Problème : Extraire des structures de données complexes incluant les propriétés CSS, les attributs de données et les éléments imbriqués pour une analyse complète du contenu.
import * as cheerio from 'cheerio';
// Sample dashboard with two `.widget` elements: a chart widget (canvas,
// timestamped "last updated" label, details link, `data-refresh` attribute)
// and a stats widget (a `.stats-list` of three metrics, one highlighted).
const complexHtml = `
<div class="dashboard">
<div class="widget" data-widget-id="chart-1" data-refresh="30" style="width: 300px; height: 200px; background-color: #f0f0f0;">
<h3 class="widget-title" style="color: #333;">Sales Chart</h3>
<div class="widget-content">
<canvas id="sales-canvas" width="280" height="150"></canvas>
<div class="widget-meta">
<span class="last-updated" data-timestamp="1642291200">Last updated: 2 hours ago</span>
<a href="/reports/sales" class="view-details" target="_blank">View Details</a>
</div>
</div>
</div>
<div class="widget" data-widget-id="stats-1" style="width: 250px; height: 150px;">
<h3 class="widget-title">Key Metrics</h3>
<ul class="stats-list">
<li data-metric="revenue">
<span class="metric-label">Revenue:</span>
<span class="metric-value" data-value="125000">$125,000</span>
</li>
<li data-metric="orders">
<span class="metric-label">Orders:</span>
<span class="metric-value" data-value="1340">1,340</span>
</li>
<li data-metric="conversion" class="metric-highlight">
<span class="metric-label">Conversion Rate:</span>
<span class="metric-value" data-value="3.2">3.2%</span>
</li>
</ul>
</div>
</div>
`;
/** Normalized description of one dashboard `.widget` element. */
interface WidgetData {
// Value of the `data-widget-id` attribute ('' when absent)
id: string;
// Trimmed text of the `.widget-title` element
title: string;
// 'chart' when the widget contains a canvas, 'stats' when it contains a `.stats-list`, otherwise 'unknown'
type: string;
// Numeric width/height parsed from the inline style (0 when missing or unparsable)
dimensions: { width: number; height: number };
// Parsed `data-refresh` attribute; presumably seconds — TODO confirm the unit
refreshInterval?: number;
// All inline style declarations of the widget element, property → value
styles: Record<string, string>;
// Type-specific payload: canvas details for charts, a `metrics` array for stats
content: any;
// Links found inside the widget that have both text and an href
links: Array<{ text: string; url: string; target?: string }>;
}
/**
 * Extracts a structured description of every `.widget` element in the markup.
 *
 * For each widget it reads the id/title, the inline-style dimensions, the
 * optional `data-refresh` interval, a type-specific `content` payload (canvas
 * details for chart widgets, metrics for stats widgets) and the contained
 * links. A widget that throws while being processed is logged and skipped.
 *
 * @param html - Markup containing the dashboard.
 * @returns One entry per successfully processed widget, in document order.
 */
function extractWidgetData(html: string): WidgetData[] {
  const $ = cheerio.load(html);
  const widgets: WidgetData[] = [];

  $('.widget').each((_, element) => {
    const $widget = $(element);
    try {
      // Basic widget information
      const id = $widget.attr('data-widget-id') || '';
      const title = $widget.find('.widget-title').text().trim();

      // Dimensions come from the inline style; always parse as base 10
      const styles = parseInlineStyles($widget.attr('style') || '');
      const dimensions = {
        width: parseInt(styles.width ?? '', 10) || 0,
        height: parseInt(styles.height ?? '', 10) || 0
      };

      // Refresh interval: undefined when the attribute is absent or not a number
      const refreshAttr = $widget.attr('data-refresh');
      const parsedRefresh = refreshAttr ? parseInt(refreshAttr, 10) : NaN;
      const refreshInterval = Number.isNaN(parsedRefresh) ? undefined : parsedRefresh;

      // Determine widget type and extract type-specific content
      let content: Record<string, unknown> = {};
      let type = 'unknown';
      const $canvas = $widget.find('canvas'); // queried once, reused below

      if ($canvas.length > 0) {
        type = 'chart';
        content = {
          canvasId: $canvas.attr('id'),
          canvasDimensions: {
            width: $canvas.attr('width'),
            height: $canvas.attr('height')
          },
          lastUpdated: $widget.find('[data-timestamp]').attr('data-timestamp')
        };
      } else if ($widget.find('.stats-list').length > 0) {
        type = 'stats';
        content = {
          metrics: $widget
            .find('.stats-list li')
            .toArray()
            .map(metricElement => {
              const $metric = $(metricElement);
              const $value = $metric.find('.metric-value');
              return {
                type: $metric.attr('data-metric') || '',
                label: $metric.find('.metric-label').text().replace(':', '').trim(),
                displayValue: $value.text().trim(),
                rawValue: parseFloat($value.attr('data-value') || '') || 0,
                highlighted: $metric.hasClass('metric-highlight')
              };
            })
        };
      }

      // Collect links that have both visible text and an href
      const links: Array<{ text: string; url: string; target?: string }> = [];
      $widget.find('a').each((_, linkElement) => {
        const $link = $(linkElement);
        const text = $link.text().trim();
        const url = $link.attr('href') || '';
        const target = $link.attr('target');
        if (text && url) {
          links.push(target ? { text, url, target } : { text, url });
        }
      });

      widgets.push({
        id,
        title,
        type,
        dimensions,
        refreshInterval,
        styles,
        content,
        links
      });
    } catch (error) {
      console.error(`Error processing widget: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
  });

  return widgets;
}
/**
 * Parses the value of an inline `style` attribute into a property → value map.
 *
 * Declarations are separated on `;` and split at their FIRST `:` only, so
 * values that themselves contain colons or semicolons — e.g.
 * `url(http://host/img.png)` or `url(data:image/png;base64,...)` — and quoted
 * strings are kept intact. Declarations without a property or without a value
 * are ignored; when a property appears twice the last one wins.
 *
 * @param styleString - Raw `style` attribute text (may be empty).
 * @returns Map of trimmed property names to trimmed values.
 */
function parseInlineStyles(styleString: string): Record<string, string> {
  const styles: Record<string, string> = {};
  if (!styleString) return styles;

  // Split on ';' only at the top level: not inside (...) and not inside quotes
  const declarations: string[] = [];
  let parenDepth = 0;
  let openQuote = '';
  let segmentStart = 0;
  for (let i = 0; i < styleString.length; i++) {
    const ch = styleString.charAt(i);
    if (openQuote) {
      if (ch === '\\') {
        i++; // skip the escaped character
      } else if (ch === openQuote) {
        openQuote = '';
      }
    } else if (ch === '"' || ch === "'") {
      openQuote = ch;
    } else if (ch === '(') {
      parenDepth++;
    } else if (ch === ')') {
      if (parenDepth > 0) parenDepth--;
    } else if (ch === ';' && parenDepth === 0) {
      declarations.push(styleString.slice(segmentStart, i));
      segmentStart = i + 1;
    }
  }
  declarations.push(styleString.slice(segmentStart));

  for (const declaration of declarations) {
    const colonAt = declaration.indexOf(':');
    if (colonAt === -1) continue;

    const property = declaration.slice(0, colonAt).trim();
    const value = declaration.slice(colonAt + 1).trim();
    if (property && value) {
      styles[property] = value;
    }
  }

  return styles;
}
// Usage: extract every widget and dump the result as formatted JSON
const widgetData = extractWidgetData(complexHtml);
console.log('Extracted Widgets:', JSON.stringify(widgetData, null, 2));

// Example: Find widgets that need frequent updates
const frequentlyUpdatedWidgets = widgetData.filter(
  ({ refreshInterval }) => refreshInterval && refreshInterval < 60
);
console.log(`Widgets with refresh < 60s: ${frequentlyUpdatedWidgets.length}`);

// Example: Calculate total dashboard area (sum of width × height)
let totalArea = 0;
for (const { dimensions } of widgetData) {
  totalArea = totalArea + dimensions.width * dimensions.height;
}
console.log(`Total dashboard area: ${totalArea} square pixels`);
Résultat attendu :
Extracted Widgets: [
{
"id": "chart-1",
"title": "Sales Chart",
"type": "chart",
"dimensions": { "width": 300, "height": 200 },
"refreshInterval": 30,
"styles": {
"width": "300px",
"height": "200px",
"background-color": "#f0f0f0"
},
"content": {
"canvasId": "sales-canvas",
"canvasDimensions": { "width": "280", "height": "150" },
"lastUpdated": "1642291200"
},
"links": [
{ "text": "View Details", "url": "/reports/sales", "target": "_blank" }
]
},
{
"id": "stats-1",
"title": "Key Metrics",
"type": "stats",
"dimensions": { "width": 250, "height": 150 },
"styles": {
"width": "250px",
"height": "150px"
},
"content": {
"metrics": [
{
"type": "revenue",
"label": "Revenue",
"displayValue": "$125,000",
"rawValue": 125000,
"highlighted": false
},
{
"type": "orders",
"label": "Orders",
"displayValue": "1,340",
"rawValue": 1340,
"highlighted": false
},
{
"type": "conversion",
"label": "Conversion Rate",
"displayValue": "3.2%",
"rawValue": 3.2,
"highlighted": true
}
]
},
"links": []
}
]
Widgets with refresh < 60s: 1
Total dashboard area: 97500 square pixels
6. Chargement d'URL et traitement de contenu distant
Problème : Récupérer et traiter le contenu HTML depuis des URL distantes, gérer différents types de contenu et traiter les erreurs de manière élégante.
import * as cheerio from 'cheerio';
/** Everything scrapeWebPage collects for one URL. */
interface ScrapingResult {
// The URL that was requested
url: string;
// Trimmed text of the <title> element
title?: string;
// Meta description, Open Graph description, or an excerpt of the first paragraph
description?: string;
// Anchors that have both an href and text; `external` flags links to another host
links: Array<{ text: string; href: string; external: boolean }>;
// Images with a src; `dimensions` is "<width>x<height>" when both attributes are present
images: Array<{ src: string; alt: string; dimensions?: string }>;
// <meta> name/property → content, plus 'structured-data' holding re-serialized JSON-LD
metadata: Record<string, string>;
// Error message when loading or processing the page failed
error?: string;
}
/**
 * Downloads a page with `cheerio.fromURL` and extracts its title, description,
 * links, images and meta information.
 *
 * Never rejects: any failure (network, parsing, invalid `url`) is reported in
 * the returned object's `error` field, alongside whatever was collected
 * before the failure.
 *
 * @param url - Absolute URL of the page to scrape; also used as the base for
 *   resolving relative links and image sources.
 * @returns The collected data for the page.
 */
async function scrapeWebPage(url: string): Promise<ScrapingResult> {
  const result: ScrapingResult = {
    url,
    links: [],
    images: [],
    metadata: {}
  };

  // Resolves a possibly-relative reference against the page URL.
  // Returns undefined instead of throwing so one bad attribute cannot abort the scrape.
  const resolveUrl = (reference: string): URL | undefined => {
    try {
      return new URL(reference, url);
    } catch {
      return undefined;
    }
  };

  try {
    // Load content from URL
    const $ = await cheerio.fromURL(url, {
      // Set custom headers
      requestOptions: {
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; CheerioBot/1.0)'
        }
      },
      // Default encoding hint for decoding the response
      encoding: {
        defaultEncoding: 'utf-8'
      }
    });

    // Computed once; a link is internal when it targets this host or one of its subdomains
    const pageHost = new URL(url).hostname;
    const isSameSite = (host: string): boolean =>
      host === pageHost || host.endsWith(`.${pageHost}`);

    // Extract basic page information
    result.title = $('title').text().trim();

    // Description: meta description, then Open Graph, then the first paragraph.
    // The ellipsis is added only when the paragraph was actually truncated.
    const paragraphExcerpt = (): string => {
      const paragraph = $('p').first().text().trim();
      return paragraph.length > 200 ? `${paragraph.substring(0, 200)}...` : paragraph;
    };
    result.description =
      $('meta[name="description"]').attr('content') ||
      $('meta[property="og:description"]').attr('content') ||
      paragraphExcerpt() ||
      undefined;

    // Extract all links
    $('a[href]').each((_, element) => {
      const $link = $(element);
      const href = $link.attr('href') || '';
      const text = $link.text().trim();
      if (!href || !text) return;

      // Compare parsed hostnames rather than searching the raw href, so
      // "https://example.com.evil.com" or "?ref=example.com" are not mistaken
      // for internal links and protocol-relative URLs are handled.
      const target = resolveUrl(href);
      const isWebLink = target?.protocol === 'http:' || target?.protocol === 'https:';
      result.links.push({
        text,
        href,
        external: target !== undefined && isWebLink && !isSameSite(target.hostname)
      });
    });

    // Extract images with metadata
    $('img[src]').each((_, element) => {
      const $img = $(element);
      const src = $img.attr('src') || '';
      const alt = $img.attr('alt') || '';
      const width = $img.attr('width');
      const height = $img.attr('height');

      const dimensions = width && height ? `${width}x${height}` : undefined;

      // Absolute http(s) sources are kept verbatim; others are resolved against
      // the page URL, falling back to the raw value when they cannot be parsed.
      const absoluteSrc = /^https?:\/\//i.test(src) ? src : resolveUrl(src)?.href ?? src;

      result.images.push({
        src: absoluteSrc,
        alt,
        dimensions
      });
    });

    // Extract metadata from meta tags
    $('meta').each((_, element) => {
      const $meta = $(element);
      const name = $meta.attr('name') || $meta.attr('property') || '';
      const content = $meta.attr('content') || '';
      if (name && content) {
        result.metadata[name] = content;
      }
    });

    // Extract structured data (JSON-LD); when several blocks exist the last valid one is kept
    $('script[type="application/ld+json"]').each((_, element) => {
      try {
        const jsonData: unknown = JSON.parse($(element).text());
        result.metadata['structured-data'] = JSON.stringify(jsonData);
      } catch {
        // Ignore malformed JSON-LD (deliberate best-effort)
      }
    });
  } catch (error) {
    result.error = error instanceof Error ? error.message : 'Unknown error occurred';
  }

  return result;
}
// Usage example with error handling
/**
 * Scrapes several pages one after another, pausing between requests.
 *
 * @param urls - Pages to scrape, processed in order.
 * @param delayMs - Pause between two consecutive requests, in milliseconds
 *   (default 1000). No pause is added after the last URL; 0 disables it.
 * @returns One result per URL, in input order. A page that fails is
 *   represented by an entry whose `error` field is set.
 */
async function scrapeMultiplePages(
  urls: string[],
  delayMs: number = 1000
): Promise<ScrapingResult[]> {
  const results: ScrapingResult[] = [];

  for (const [index, url] of urls.entries()) {
    console.log(`Scraping: ${url}`);
    try {
      results.push(await scrapeWebPage(url));
    } catch (error) {
      console.error(`Failed to scrape ${url}:`, error);
      results.push({
        url,
        links: [],
        images: [],
        metadata: {},
        error: error instanceof Error ? error.message : 'Failed to fetch'
      });
    }

    // Be respectful to servers: wait between requests, but not after the final one
    const isLast = index === urls.length - 1;
    if (!isLast && delayMs > 0) {
      await new Promise<void>(resolve => setTimeout(resolve, delayMs));
    }
  }

  return results;
}
// Example usage (commented out to avoid actual HTTP requests in documentation)
/*
async function main() {
const urlsToScrape = [
'https://example.com',
'https://httpbin.org/html',
];
const results = await scrapeMultiplePages(urlsToScrape);
results.forEach(result => {
console.log(`\n--- Results for ${result.url} ---`);
if (result.error) {
console.log('Error:', result.error);
return;
}
console.log('Title:', result.title);
console.log('Description:', result.description?.substring(0, 100) + '...');
console.log('Links found:', result.links.length);
console.log('Images found:', result.images.length);
console.log('External links:', result.links.filter(link => link.external).length);
// Show some metadata
Object.entries(result.metadata).slice(0, 5).forEach(([key, value]) => {
console.log(`Meta ${key}:`, value.substring(0, 50) + '...');
});
});
}
main().catch(console.error);
*/
// For demonstration, here's what the output structure would look like.
// This is a hand-written sample value, not the result of a real request;
// it is printed as formatted JSON below.
const exampleResult: ScrapingResult = {
url: 'https://example.com',
title: 'Example Domain',
description: 'This domain is for use in illustrative examples...',
links: [
{ text: 'More information...', href: 'https://iana.org/domains/example', external: true }
],
images: [
{ src: 'https://example.com/logo.png', alt: 'Example Logo', dimensions: '200x100' }
],
metadata: {
'viewport': 'width=device-width, initial-scale=1',
'og:title': 'Example Domain',
'og:description': 'This domain is for use in illustrative examples'
}
};
console.log('Example scraping result:', JSON.stringify(exampleResult, null, 2));
Résultat attendu :
Example scraping result: {
"url": "https://example.com",
"title": "Example Domain",
"description": "This domain is for use in illustrative examples...",
"links