Comprehensive Cheerio Tutorial Guides
Guide 1: How to Extract Data from HTML Tables with Cheerio
Scraping tabular data is one of the most common tasks when working with HTML documents. Cheerio makes it easy to parse table structures and extract meaningful data. This guide walks you through several techniques for extracting data from HTML tables using Cheerio.
Setting Up Your Environment
First, install Cheerio in your project:
npm install cheerio
Basic Table Structure Understanding
Before diving into extraction techniques, let's understand a typical HTML table structure:
<table id="products">
<thead>
<tr>
<th>Product Name</th>
<th>Price</th>
<th>Stock</th>
</tr>
</thead>
<tbody>
<tr>
<td>Laptop</td>
<td>$999</td>
<td>15</td>
</tr>
<tr>
<td>Mouse</td>
<td>$25</td>
<td>50</td>
</tr>
</tbody>
</table>
Step 1: Loading HTML and Basic Table Selection
Start by loading your HTML content with Cheerio:
import * as cheerio from 'cheerio';
const html = `
<table id="products">
<thead>
<tr>
<th>Product Name</th>
<th>Price</th>
<th>Stock</th>
</tr>
</thead>
<tbody>
<tr>
<td>Laptop</td>
<td>$999</td>
<td>15</td>
</tr>
<tr>
<td>Mouse</td>
<td>$25</td>
<td>50</td>
</tr>
</tbody>
</table>
`;
const $ = cheerio.load(html);
Step 2: Extracting Table Headers
Table headers provide the structure for your data. Extract them first:
// Collect the trimmed text of every header cell, in document order.
const headers: string[] = $('#products thead th')
  .map((_, th) => $(th).text().trim())
  .get();
console.log('Headers:', headers);
// Output: ['Product Name', 'Price', 'Stock']
Step 3: Extracting Row Data
Now extract the actual table data. There are several approaches depending on your needs:
Method 1: Row-by-row extraction
// One inner array per body row, holding the trimmed text of each cell.
// toArray() + Array.prototype.map keeps the nested arrays intact.
const rows: string[][] = $('#products tbody tr')
  .toArray()
  .map((tr) =>
    $(tr)
      .find('td')
      .toArray()
      .map((td) => $(td).text().trim()),
  );
console.log('Rows:', rows);
// Output: [['Laptop', '$999', '15'], ['Mouse', '$25', '50']]
Method 2: Creating structured objects
/** A row of the `#products` table mapped onto named fields. */
interface Product {
  /** Trimmed text of the first cell. */
  productName: string;
  /** Trimmed text of the second cell, kept as displayed (e.g. "$999"). */
  price: string;
  /** Third cell parsed as a base-10 integer; NaN when the cell is not numeric. */
  stock: number;
}

const products: Product[] = [];
$('#products tbody tr').each((_, row) => {
  const cells = $(row).find('td');
  products.push({
    // .eq(i) yields an empty selection (text '') when the cell is missing,
    // so short rows do not require indexing into possibly-undefined nodes.
    productName: cells.eq(0).text().trim(),
    price: cells.eq(1).text().trim(),
    // Explicit radix: without it, text such as "0x10" would be read as hex.
    stock: Number.parseInt(cells.eq(2).text().trim(), 10),
  });
});
console.log('Products:', products);
Step 4: Handling Complex Tables
Real-world tables often have merged cells, nested elements, or special formatting. Here's how to handle them:
const complexHtml = `
<table class="sales-data">
<tr>
<td rowspan="2">Q1</td>
<td>January</td>
<td>$<span class="amount">15000</span></td>
<td class="status success">✓</td>
</tr>
<tr>
<td>February</td>
<td>$<span class="amount">18000</span></td>
<td class="status pending">⏳</td>
</tr>
</table>
`;
const $complex = cheerio.load(complexHtml);
/** One month of sales taken from a `.sales-data` row. */
interface SalesRecord {
  quarter: string;
  month: string;
  amount: string;
  status: 'Complete' | 'Pending';
}

const salesData: SalesRecord[] = [];
// Quarter label from the most recent row that carried its own quarter cell.
// Rows covered by a rowspan omit that cell and inherit this value.
let currentQuarter = '';
$complex('.sales-data tr').each((_, row) => {
  const cells = $complex(row).find('td');
  // Rows without any <td> (e.g. header rows) carry no sales data.
  if (cells.length === 0) return;

  // A full row has 4 cells; a row under a rowspan has only 3, so every
  // later column shifts left by one.
  const hasQuarterCell = cells.length === 4;
  if (hasQuarterCell) {
    currentQuarter = cells.eq(0).text();
  }
  const offset = hasQuarterCell ? 1 : 0;

  const statusClass = cells.eq(offset + 2).attr('class');
  salesData.push({
    quarter: currentQuarter,
    month: cells.eq(offset).text(),
    amount: cells.eq(offset + 1).find('.amount').text(),
    status: statusClass?.includes('success') ? 'Complete' : 'Pending',
  });
});
Step 5: Advanced Filtering and Data Processing
Use Cheerio's powerful selectors to filter and process your data:
// Extract only rows with specific conditions
const highStockProducts: { name: string; stock: number }[] = [];
$('#products tbody tr').each((_, row) => {
  const $row = $(row);
  // Explicit radix so the cell text is always read as a decimal integer.
  // A non-numeric cell yields NaN, which fails the comparison below.
  const stock = Number.parseInt($row.find('td:nth-child(3)').text(), 10);
  if (stock > 20) {
    highStockProducts.push({
      name: $row.find('td:first-child').text(),
      stock,
    });
  }
});
// Extract data using attribute selectors
$('table[data-category="electronics"] tbody tr').each((_index, _row) => {
  // Process electronics category tables only
});
Complete Working Example
Here's a comprehensive example that demonstrates table data extraction:
import * as cheerio from 'cheerio';
// Sample HTML with a product catalog table
const html = `
<!DOCTYPE html>
<html>
<body>
<table id="product-catalog" class="data-table">
<thead>
<tr>
<th>ID</th>
<th>Product Name</th>
<th>Category</th>
<th>Price</th>
<th>Stock</th>
<th>Rating</th>
</tr>
</thead>
<tbody>
<tr data-id="1">
<td>001</td>
<td><a href="/laptop-pro">Laptop Pro</a></td>
<td>Electronics</td>
<td>$<span class="price">1299</span></td>
<td class="stock-high">25</td>
<td><span class="rating" data-score="4.5">★★★★☆</span></td>
</tr>
<tr data-id="2">
<td>002</td>
<td><a href="/wireless-mouse">Wireless Mouse</a></td>
<td>Electronics</td>
<td>$<span class="price">35</span></td>
<td class="stock-medium">12</td>
<td><span class="rating" data-score="4.2">★★★★☆</span></td>
</tr>
<tr data-id="3">
<td>003</td>
<td><a href="/desk-lamp">LED Desk Lamp</a></td>
<td>Office</td>
<td>$<span class="price">89</span></td>
<td class="stock-low">3</td>
<td><span class="rating" data-score="4.7">★★★★★</span></td>
</tr>
</tbody>
</table>
</body>
</html>
`;
/**
 * One row of the `#product-catalog` table, as built by extractTableData.
 */
interface Product {
/** Trimmed text of the first column (e.g. "001"). */
id: string;
/** Trimmed text of the link in the second column. */
name: string;
/** Trimmed text of the third column. */
category: string;
/** Text of the row's `.price` element run through parseFloat. */
price: number;
/** Text of the fifth column run through parseInt. */
stock: number;
/** The `.rating` element's `data-score` attribute as a float; 0 when the attribute is absent. */
rating: number;
/** `href` of the product link, or '' when there is none. */
url: string;
/** Taken from the stock cell's `stock-low` / `stock-high` class; 'medium' otherwise. */
stockLevel: 'low' | 'medium' | 'high';
}
/**
 * Parses the `#product-catalog` table found in `html` into product records.
 *
 * Each `<tbody>` row yields one record. Cells are read by position
 * (1 = id, 2 = linked name, 3 = category, 5 = stock); price and rating are
 * located by their `.price` / `.rating` elements, so they may sit in any column.
 *
 * @param html - Markup containing a table with id `product-catalog`.
 * @returns One Product per body row, in document order. Numeric fields are
 *          NaN when the corresponding text cannot be parsed.
 */
function extractTableData(html: string): Product[] {
  const $ = cheerio.load(html);

  return $('#product-catalog tbody tr')
    .toArray()
    .map((row): Product => {
      const $row = $(row);

      // The product name and its detail-page URL live on the same anchor.
      const $link = $row.find('td:nth-child(2) a');

      // The stock cell carries both the quantity (text) and the level (class).
      const $stockCell = $row.find('td:nth-child(5)');
      let stockLevel: Product['stockLevel'] = 'medium';
      if ($stockCell.hasClass('stock-low')) stockLevel = 'low';
      else if ($stockCell.hasClass('stock-high')) stockLevel = 'high';

      return {
        id: $row.find('td:nth-child(1)').text().trim(),
        name: $link.text().trim(),
        category: $row.find('td:nth-child(3)').text().trim(),
        price: Number.parseFloat($row.find('.price').text()),
        // Explicit radix: quantities are decimal, never hex-prefixed.
        stock: Number.parseInt($stockCell.text().trim(), 10),
        // A missing data-score attribute is treated as a rating of 0.
        rating: Number.parseFloat($row.find('.rating').attr('data-score') || '0'),
        url: $link.attr('href') || '',
        stockLevel,
      };
    });
}
// Usage: parse the sample catalog and print it as formatted JSON.
const products = extractTableData(html);
console.log('Extracted Products:', JSON.stringify(products, null, 2));

// Additional processing examples: derive a few filtered views of the result.
const highRatedProducts = products.filter(({ rating }) => rating >= 4.5);
const lowStockProducts = products.filter(({ stockLevel }) => stockLevel === 'low');
const expensiveProducts = products.filter(({ price }) => price > 100);

console.log(`Found ${highRatedProducts.length} high-rated products`);
console.log(`Found ${lowStockProducts.length} low-stock products`);
console.log(`Found ${expensiveProducts.length} expensive products`);
This approach handles nested markup, values encoded in classes and data attributes, and numeric parsing, and it provides a foundation for most table-scraping tasks with Cheerio.
Guide 2: How to Scrape Form Data and Input Values with Cheerio
Forms are essential components of web pages, containing valuable data like user inputs, selections, and configurations. This guide demonstrates how to extract form data, input values, and form structures using Cheerio's powerful form manipulation capabilities.
Understanding Form Elements
Forms contain various input types that require different extraction approaches:
<form id="user-registration">
<input type="text" name="username" value="john_doe">
<input type="email" name="email" value="john@example.com">
<input type="password" name="password" value="secret123">
<input type="checkbox" name="newsletter" checked>
<select name="country">
<option value="us" selected>United States</option>
<option value="ca">Canada</option>
</select>
<textarea name="bio">Software developer...</textarea>
</form>
Step 1: Setting Up Form Data Extraction
Begin by loading your HTML and understanding the form structure:
import * as cheerio from 'cheerio';
const formHtml = `
<form id="contact-form" method="post" action="/submit">
<div class="form-group">
<label for="name">Full Name:</label>
<input type="text" id="name" name="fullName" value="Jane Smith" required>
</div>
<div class="form-group">
<label for="email">Email:</label>
<input type="email" id="email" name="email" value="jane@company.com">
</div>
<div class="form-group">
<label>Gender:</label>
<input type="radio" name="gender" value="male" id="male">
<label for="male">Male</label>
<input type="radio" name="gender" value="female" id="female" checked>
<label for="female">Female</label>
</div>
<div class="form-group">
<label for="interests">Interests:</label>
<input type="checkbox" name="interests" value="coding" checked>
<label>Coding</label>
<input type="checkbox" name="interests" value="design">
<label>Design</label>
<input type="checkbox" name="interests" value="marketing" checked>
<label>Marketing</label>
</div>
<div class="form-group">
<label for="country">Country:</label>
<select name="country" id="country">
<option value="">Select Country</option>
<option value="us" selected>United States</option>
<option value="uk">United Kingdom</option>
<option value="ca">Canada</option>
</select>
</div>
<div class="form-group">
<label for="message">Message:</label>
<textarea name="message" id="message" rows="4">Hello, I'm interested in your services...</textarea>
</div>
<button type="submit">Submit</button>
</form>
`;
const $ = cheerio.load(formHtml);
Step 2: Extracting Text Input Values
The values of text, email, and password inputs can be read with the val() method:
// Read single inputs by their name attribute
const fullName = $('input[name="fullName"]').val();
const email = $('input[name="email"]').val();
console.log('Full Name:', fullName); // "Jane Smith"
console.log('Email:', email); // "jane@company.com"

// Build a name -> value map of every text-like input in one pass
const textInputs: Record<string, string> = {};
const textLikeInputs = $('input[type="text"], input[type="email"], input[type="password"]').toArray();
for (const field of textLikeInputs) {
  const $field = $(field);
  const fieldName = $field.attr('name');
  // Inputs without a name cannot be keyed, so they are left out.
  if (!fieldName) continue;
  textInputs[fieldName] = ($field.val() as string) || '';
}
console.log('Text Inputs:', textInputs);
Step 3: Handling Radio Buttons
Radio buttons require checking which option is selected:
// Value of whichever radio in the "gender" group is checked
const selectedGender = $('input[name="gender"]:checked').val();
console.log('Selected Gender:', selectedGender); // "female"

// Describe every option in the group, checked or not
const genderOptions: { value: string; label: string; checked: boolean }[] = $('input[name="gender"]')
  .toArray()
  .map((radio) => {
    const $option = $(radio);
    // Prefer the text of the <label> that directly follows the input;
    // fall back to the input's id, then to an empty string.
    const labelText = $option.next('label').text() || $option.attr('id') || '';
    return {
      value: $option.val() as string,
      label: labelText.trim(),
      checked: $option.prop('checked') as boolean,
    };
  });
console.log('Gender Options:', genderOptions);
Step 4: Working with Checkboxes
Checkboxes can have multiple selected values:
// Values of every checked box in the "interests" group
const selectedInterests: string[] = $('input[name="interests"]:checked')
  .toArray()
  .map((box) => $(box).val() as string);
console.log('Selected Interests:', selectedInterests); // ["coding", "marketing"]

// Value, label text and checked state for every box in the group
const interestOptions = $('input[name="interests"]')
  .toArray()
  .map((box) => {
    const $box = $(box);
    return {
      value: $box.val() as string,
      // The label is the <label> element immediately after the input.
      label: $box.next('label').text().trim(),
      checked: $box.prop('checked') as boolean,
    };
  });
Step 5: Extracting Select Dropdown Values
Select elements require special handling for options:
// Value of the currently selected <option>
const selectedCountry = $('select[name="country"]').val();
console.log('Selected Country:', selectedCountry); // "us"

// Visible text of the currently selected <option>
const selectedCountryText = $('select[name="country"] option:selected').text();
console.log('Selected Country Text:', selectedCountryText); // "United States"

// Every option of the dropdown with its value, text and selected flag
const countryOptions: { value: string; text: string; selected: boolean }[] = $('select[name="country"] option')
  .toArray()
  .map((opt) => {
    const $opt = $(opt);
    return {
      value: $opt.val() as string,
      text: $opt.text().trim(),
      selected: $opt.prop('selected') as boolean,
    };
  });
console.log('Country Options:', countryOptions);
Step 6: Handling Textarea Elements
A textarea keeps its value as text content rather than in a value attribute, so it can be read with either text() or val():
const $messageField = $('textarea[name="message"]');

// Read the textarea through its text content
const message = $messageField.text();
console.log('Message:', message); // "Hello, I'm interested in your services..."

// Same field read through val()
const messageAlt = $messageField.val();
console.log('Message (alt):', messageAlt);
Step 7: Form Validation and Metadata Extraction
Extract form metadata and validation rules:
/**
 * Summary of a form's attributes and fields, as returned by extractFormMetadata.
 */
interface FormMetadata {
/** The form's `action` attribute, or '' when absent. */
action: string;
/** The form's `method` attribute lower-cased; 'get' when absent. */
method: string;
/** The form's `enctype` attribute, if present. */
enctype?: string;
/** Number of input, select and textarea elements inside the form. */
fieldCount: number;
/** `name` attributes of the fields marked `required`. */
requiredFields: string[];
}
/**
 * Reads a form's attributes and summarises its fields.
 *
 * Uses the module-level `$` document, so the form must be part of the HTML
 * that was loaded into it.
 *
 * @param formSelector - CSS selector identifying the form.
 * @returns The form's action, lower-cased method, optional enctype, the number
 *          of input/select/textarea elements, and the names of required fields.
 */
function extractFormMetadata(formSelector: string): FormMetadata {
  const $form = $(formSelector);

  // Names of required controls; controls without a name are left out.
  const requiredFields = $form
    .find('input[required], select[required], textarea[required]')
    .toArray()
    .flatMap((control) => {
      const controlName = $(control).attr('name');
      return controlName ? [controlName] : [];
    });

  return {
    action: $form.attr('action') || '',
    method: ($form.attr('method') || 'get').toLowerCase(),
    enctype: $form.attr('enctype'),
    fieldCount: $form.find('input, select, textarea').length,
    requiredFields,
  };
}
// Summarise the contact form and print the result.
const formMetadata = extractFormMetadata('#contact-form');
console.log('Form Metadata:', formMetadata);
Step 8: Comprehensive Form Data Serialization
Use Cheerio's built-in serialization methods:
// Using Cheerio's serialize method
const serializedData = $('#contact-form').serialize();
console.log('Serialized Form:', serializedData);

// Using serializeArray for structured data
const formArray = $('#contact-form').serializeArray();
console.log('Form Array:', formArray);

// Convert to object format: a single value stays a string, repeated names
// (e.g. checkbox groups) are collected into an array in document order.
const formObject: Record<string, string | string[]> = {};
for (const { name, value } of formArray) {
  // Test for an own property rather than truthiness: an earlier empty-string
  // value must still count as "seen", and inherited keys such as "toString"
  // must not be mistaken for previously stored values.
  if (!Object.prototype.hasOwnProperty.call(formObject, name)) {
    formObject[name] = value;
    continue;
  }
  const existing = formObject[name];
  if (Array.isArray(existing)) {
    existing.push(value);
  } else {
    formObject[name] = [existing, value];
  }
}
Complete Working Example
Here's a comprehensive form data extraction solution:
import * as cheerio from 'cheerio';
/**
 * Structured view of a form's current values, as returned by extractCompleteFormData.
 * NOTE(review): this shares its name with the platform's global `FormData` type — confirm the shadowing is intended.
 */
interface FormData {
/** Value of each text-like input, keyed by its `name`. */
textFields: Record<string, string>;
/** Checked value of each radio group, keyed by group name. */
radioButtons: Record<string, string>;
/** Checked values of each checkbox group, keyed by group name. */
checkboxes: Record<string, string[]>;
/** Selected option (value and visible text) of each select, keyed by its `name`. */
selects: Record<string, { value: string; text: string }>;
/** Content of each textarea, keyed by its `name`. */
textareas: Record<string, string>;
/** Attributes and field summary of the form element itself. */
metadata: {
/** The form's `action` attribute, or '' when absent. */
action: string;
/** The form's `method` attribute lower-cased; 'get' when absent. */
method: string;
/** Number of form controls counted by the extractor. */
fieldCount: number;
/** `name` attributes of the controls marked `required`. */
requiredFields: string[];
};
}
const complexFormHtml = `
<form id="survey-form" action="/submit-survey" method="post">
<fieldset>
<legend>Personal Information</legend>
<input type="text" name="firstName" value="John" required>
<input type="text" name="lastName" value="Doe" required>
<input type="email" name="email" value="john.doe@email.com" required>
<input type="tel" name="phone" value="+1-555-123-4567">
</fieldset>
<fieldset>
<legend>Preferences</legend>
<div>
<p>Preferred Contact Method:</p>
<input type="radio" name="contactMethod" value="email" checked>
<label>Email</label>
<input type="radio" name="contactMethod" value="phone">
<label>Phone</label>
<input type="radio" name="contactMethod" value="mail">
<label>Mail</label>
</div>
<div>
<p>Interests (select all that apply):</p>
<input type="checkbox" name="interests" value="technology" checked>
<label>Technology</label>
<input type="checkbox" name="interests" value="sports">
<label>Sports</label>
<input type="checkbox" name="interests" value="music" checked>
<label>Music</label>
<input type="checkbox" name="interests" value="travel">
<label>Travel</label>
</div>
</fieldset>
<fieldset>
<legend>Location</legend>
<select name="country" required>
<option value="">Select Country</option>
<option value="us" selected>United States</option>
<option value="ca">Canada</option>
<option value="uk">United Kingdom</option>
<option value="au">Australia</option>
</select>
<select name="timezone">
<option value="est">Eastern Time</option>
<option value="cst" selected>Central Time</option>
<option value="mst">Mountain Time</option>
<option value="pst">Pacific Time</option>
</select>
</fieldset>
<fieldset>
<legend>Additional Information</legend>
<textarea name="comments" rows="4">Looking forward to hearing from you!</textarea>
<textarea name="suggestions" placeholder="Any suggestions?"></textarea>
</fieldset>
<button type="submit">Submit Survey</button>
<button type="reset">Reset Form</button>
</form>
`;
/**
 * Extracts the current values and metadata of one form from an HTML document.
 *
 * Radio and checkbox groups are resolved by inspecting each input's `type`
 * and `name` attributes directly instead of building a selector from the
 * name. Names containing quotes or brackets therefore cannot break the query,
 * and a radio group never picks up a checkbox that happens to share its name.
 *
 * @param html - Markup containing the form.
 * @param formSelector - CSS selector identifying the form.
 * @returns The form's values grouped by control kind, plus metadata:
 *   - textFields: text, email, tel and password inputs only;
 *   - radioButtons: first checked value per group (groups with no checked or
 *     an empty value are omitted);
 *   - checkboxes: every group present, with an empty array when none is checked;
 *   - selects / textareas: keyed by `name`;
 *   - metadata.fieldCount: input, select and textarea elements (buttons are
 *     not counted, matching extractFormMetadata).
 */
function extractCompleteFormData(html: string, formSelector: string): FormData {
  const $ = cheerio.load(html);
  const $form = $(formSelector);

  // Text-like inputs
  const textFields: Record<string, string> = {};
  $form
    .find('input[type="text"], input[type="email"], input[type="tel"], input[type="password"]')
    .each((_, element) => {
      const $input = $(element);
      const name = $input.attr('name');
      if (name) textFields[name] = ($input.val() as string) || '';
    });

  // Radio groups: only the first checked radio of each group is considered.
  const radioButtons: Record<string, string> = {};
  const resolvedRadioGroups = new Set<string>();
  $form.find('input[type="radio"]:checked').each((_, element) => {
    const $radio = $(element);
    const name = $radio.attr('name');
    if (!name || resolvedRadioGroups.has(name)) return;
    resolvedRadioGroups.add(name);
    const value = $radio.val() as string;
    if (value) radioButtons[name] = value;
  });

  // Checkbox groups: visit every box so unchecked groups still get an entry.
  const checkboxGroups = new Map<string, string[]>();
  $form.find('input[type="checkbox"]').each((_, element) => {
    const $box = $(element);
    const name = $box.attr('name');
    if (!name) return;
    let values = checkboxGroups.get(name);
    if (!values) {
      values = [];
      checkboxGroups.set(name, values);
    }
    if ($box.is(':checked')) values.push($box.val() as string);
  });
  const checkboxes: Record<string, string[]> = {};
  checkboxGroups.forEach((values, name) => {
    checkboxes[name] = values;
  });

  // Selects: value and visible text of the selected option
  const selects: Record<string, { value: string; text: string }> = {};
  $form.find('select').each((_, element) => {
    const $select = $(element);
    const name = $select.attr('name');
    if (!name) return;
    const $selectedOption = $select.find('option:selected');
    selects[name] = {
      value: ($selectedOption.val() as string) || '',
      text: $selectedOption.text().trim(),
    };
  });

  // Textareas
  const textareas: Record<string, string> = {};
  $form.find('textarea').each((_, element) => {
    const $textarea = $(element);
    const name = $textarea.attr('name');
    if (name) textareas[name] = ($textarea.val() as string) || '';
  });

  // Form-level metadata
  const metadata = {
    action: $form.attr('action') || '',
    method: ($form.attr('method') || 'get').toLowerCase(),
    // Buttons submit or reset the form but hold no data, so they are excluded.
    fieldCount: $form.find('input, select, textarea').length,
    requiredFields: $form
      .find('[required]')
      .map((_, el) => $(el).attr('name'))
      .get()
      .filter(Boolean),
  };

  return { textFields, radioButtons, checkboxes, selects, textareas, metadata };
}
// Usage: extract everything from the survey form, then print each section.
const formData = extractCompleteFormData(complexFormHtml, '#survey-form');
console.log('=== EXTRACTED FORM DATA ===');

const formDataSections: [string, unknown][] = [
  ['Text Fields:', formData.textFields],
  ['Radio Buttons:', formData.radioButtons],
  ['Checkboxes:', formData.checkboxes],
  ['Select Fields:', formData.selects],
  ['Textareas:', formData.textareas],
  ['Metadata:', formData.metadata],
];
for (const [label, section] of formDataSections) {
  console.log(label, section);
}
// Additional utility functions
/**
 * Checks that every required field of an extracted form has a value.
 *
 * A field counts as filled when any of the extracted groups holds a non-empty
 * value under its name: a text field, a radio selection, a select value,
 * textarea content, or at least one checked box in a checkbox group.
 *
 * @param data - Form data whose `metadata.requiredFields` lists the names to check.
 * @returns `isValid` is true when nothing is missing; `errors` holds one
 *          message per missing field, in the order of `requiredFields`.
 */
function validateFormData(data: FormData): { isValid: boolean; errors: string[] } {
  const isFilled = (fieldName: string): boolean =>
    Boolean(
      data.textFields[fieldName] ||
        data.radioButtons[fieldName] ||
        data.selects[fieldName]?.value ||
        // Textareas and checkbox groups can be required too; without these
        // checks they would always be reported as missing.
        data.textareas[fieldName] ||
        data.checkboxes[fieldName]?.length,
    );

  const errors = data.metadata.requiredFields
    .filter((fieldName) => !isFilled(fieldName))
    .map((fieldName) => `Required field '${fieldName}' is missing`);

  return {
    isValid: errors.length === 0,
    errors,
  };
}
// Validate the extracted survey data and print the outcome.
const validation = validateFormData(formData);
console.log('Validation Result:', validation);
This comprehensive approach handles all common form elements and provides structured data extraction with validation capabilities, making it perfect for form analysis and data processing tasks.
Guide 3: How to Navigate and Modify DOM Elements with Cheerio
DOM navigation and modification are core aspects of web scraping and HTML manipulation. This guide demonstrates how to effectively traverse the DOM tree, find related elements, and modify content using Cheerio's powerful navigation methods.
Understanding DOM Tree Structure
Before diving into navigation techniques, it's important to understand the hierarchical nature of HTML:
<div class="container"> <!-- Parent -->
<header class="main-header"> <!-- First child -->
<h1>Page Title</h1> <!-- Grandchild -->
<nav>Navigation</nav> <!-- Sibling to h1 -->
</header>
<main class="content"> <!-- Next sibling to header -->
<article>Article content</article>
<aside>Sidebar</aside>
</main>
<footer>Footer content</footer> <!-- Last child -->
</div>
Step 1: Basic Element Selection and Navigation
Start with fundamental navigation methods:
import * as cheerio from 'cheerio';
const html = `
<div class="blog-post" data-id="123">
<header class="post-header">
<h1 class="post-title">Understanding Web Scraping</h1>
<div class="post-meta">
<span class="author">John Smith</span>
<time class="published" datetime="2024-01-15">January 15, 2024</time>
<div class="tags">
<span class="tag">web-scraping</span>
<span class="tag">javascript</span>
<span class="tag">cheerio</span>
</div>
</div>
</header>
<div class="post-content">
<p class="intro">Web scraping is a powerful technique...</p>
<p>In this article, we'll explore...</p>
<blockquote>
<p>"Data is the new oil of the digital economy."</p>
<cite>— Industry Expert</cite>
</blockquote>
<p class="conclusion">To summarize...</p>
</div>
<footer class="post-footer">
<div class="social-share">
<button class="share-btn twitter" data-url="https://example.com/post/123">Twitter</button>
<button class="share-btn facebook" data-url="https://example.com/post/123">Facebook</button>
<button class="share-btn linkedin" data-url="https://example.com/post/123">LinkedIn</button>
</div>
<div class="post-actions">
<button class="like-btn" data-likes="42">❤️ 42</button>
<button class="comment-btn" data-comments="8">💬 8</button>
</div>
</footer>
</div>
`;
const $ = cheerio.load(html);
// Select the title element and read its text
const postTitle = $('.post-title').text();
console.log('Post Title:', postTitle);

// Step up one level to the element that directly contains the title
const postHeader = $('.post-title').parent();
const postHeaderClass = postHeader.attr('class');
console.log('Parent class:', postHeaderClass); // "post-header"

// Walk up the ancestors until one matches the given selector
const blogPostContainer = $('.post-title').closest('.blog-post');
const containerId = blogPostContainer.attr('data-id');
console.log('Container data-id:', containerId); // "123"
Step 2: Sibling Navigation
Navigate between sibling elements efficiently:
// Element immediately after the title
const titleNextSibling = $('.post-title').next();
console.log('Next sibling class:', titleNextSibling.attr('class')); // "post-meta"

// Every sibling that comes after the header
const allFollowingSiblings = $('.post-header').nextAll();
console.log('Following siblings count:', allFollowingSiblings.length); // 2

// Element immediately before the meta block
const metaPrevSibling = $('.post-meta').prev();
console.log('Previous sibling tag:', metaPrevSibling.get(0)?.tagName); // "h1"

// Every sibling that comes before the footer
const allPrecedingSiblings = $('.post-footer').prevAll();
console.log('Preceding siblings count:', allPrecedingSiblings.length); // 2

// All siblings of the header, in document order
const headerSiblings = $('.post-header').siblings();
headerSiblings.toArray().forEach((sibling, position) => {
  console.log(`Sibling ${position + 1}:`, $(sibling).attr('class'));
});
Step 3: Child and Descendant Navigation
Work with child elements and deep descendants:
// Elements nested exactly one level below .post-content
const postContentChildren = $('.post-content').children();
const directChildCount = postContentChildren.length;
console.log('Direct children count:', directChildCount);

// The first of those direct children
const firstChild = $('.post-content').children().first();
const firstChildClass = firstChild.attr('class');
console.log('First child class:', firstChildClass); // "intro"
// Get last child