Introduction
Web scraping tables is one of the most common tasks in data extraction. Whether you're collecting financial data, sports statistics, or product information, tables are everywhere on the web. In this comprehensive guide, we'll explore multiple methods to scrape HTML tables using Python.
We'll cover everything from basic static tables to complex dynamic tables loaded via JavaScript, ensuring you have the tools to handle any table scraping scenario.
Prerequisites
Before we begin, make sure you have Python installed on your system. We'll be using several libraries throughout this tutorial:
pip install beautifulsoup4 requests pandas lxml selenium requests-html playwright rich
For Playwright specifically, you'll also need to install the browser binaries:
playwright install
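To confirm that Playwright and its browser binaries are installed correctly, a quick sanity check like the following should print a page title (a minimal sketch; the URL is just an example):
# Minimal Playwright sanity check: open a page and print its title
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)  # Launch Chromium without a GUI
    page = browser.new_page()
    page.goto("https://example.com")  # Any reachable URL works here
    print(page.title())  # Should print "Example Domain"
    browser.close()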
TL;DR - Quick Start
Need to scrape a table fast? Here's the quickest method, using a Playwright-based CLI scraper we coded for you:
#!/usr/bin/env python3
"""
Table Scraper CLI - Python Edition
A command-line version that mimics the Chrome extension behavior
"""
# Import necessary libraries for web scraping and data manipulation
from playwright.sync_api import sync_playwright, Page, Locator # Playwright for browser automation
import pandas as pd # For data manipulation and creating DataFrames
from typing import List, Dict, Optional, Tuple, Any # Type hints for better code documentation
import time # For adding delays between operations
import sys # System-specific parameters and functions
import os # Operating system interface
import argparse # For parsing command-line arguments
from datetime import datetime # For timestamp generation
from rich.console import Console # Rich library for beautiful terminal output
from rich.table import Table as RichTable # For displaying tables in terminal
from rich.panel import Panel # For creating styled panels in terminal
from rich.prompt import Prompt, Confirm # For interactive user prompts
from rich.progress import Progress, SpinnerColumn, TextColumn # For progress indicators
import json # For JSON data handling
import hashlib # For generating hashes to detect duplicate content
import re # Regular expressions for pattern matching
from pathlib import Path # Object-oriented filesystem paths
from io import StringIO # String buffer for in-memory file operations
from urllib.parse import urlparse # For parsing URLs to extract domain
# Initialize Rich console for styled terminal output
console = Console()
# Define path for storing domain-specific configurations (mimics Chrome extension's localStorage)
# This file stores saved selectors and settings for each domain
DOMAIN_CONFIG_FILE = Path.home() / ".table_scraper" / "domain_configs.json"
class TableScraperCLI:
"""Main class for the Table Scraper CLI application"""
def __init__(self):
"""Initialize the scraper with default settings"""
self.browser = None # Playwright browser instance
self.page = None # Current page being scraped
self.tables_data = [] # List of found tables with their data and metadata
self.current_table_index = 0 # Index of currently selected table
self.scraped_rows = [] # Accumulated rows from multi-page scraping
self.next_button_selector = None # CSS selector for pagination "next" button
self.scraped_hashes = set() # Set of content hashes for duplicate detection
self.enable_pattern_detection = True # Whether to detect div/list-based tables
self.extract_nested_data = False # Whether to extract links/images from table cells
self.current_domain = None # Domain of the current URL being scraped
self.domain_configs = self.load_domain_configs() # Load saved configurations for domains
def launch_browser(self, url: str, headless: bool = False):
"""Launch browser and navigate to URL
Args:
url: The URL to navigate to
headless: Whether to run browser without GUI (True = no visible browser)
"""
# Start Playwright and launch Chromium browser
playwright = sync_playwright().start()
self.browser = playwright.chromium.launch(headless=headless)
self.page = self.browser.new_page()
# Load any saved configuration for this domain (like saved selectors)
domain_config = self.get_domain_config(url)
# Navigate to the URL and wait for page to fully load
with console.status("[bold green]Loading page...") as status:
self.page.goto(url)
self.page.wait_for_load_state("networkidle") # Wait until network is idle
# Check if we have a previously saved "next" button selector for this domain
if domain_config.get('nextSelector'):
console.print(f"[dim]Found saved config for {self.current_domain}[/dim]")
self.next_button_selector = domain_config['nextSelector']
# Inject CSS styles for visual highlighting of tables and buttons
# These styles help users see which table is selected and which button is the "next" button
self.page.add_style_tag(content="""
.tablescraper-selected-table {
border: 3px solid red !important; /* Red border for selected table */
}
.tablescraper-selected-row {
background-color: rgba(225,227,107,.54) !important; /* Yellow highlight for selected rows */
}
.tablescraper-hover {
background-color: rgba(159,238,155,.44) !important; /* Green highlight on hover */
}
.tablescraper-next-button {
background-color: green !important; /* Green background for next button */
}
/* Backwards compatibility with existing code */
.tds-highlight {
border: 3px solid red !important;
}
.tds-next-button {
background-color: green !important;
}
""")
def find_tables(self) -> List[Dict]:
"""Find all tables on the current page with intelligent scoring
Returns:
List of dictionaries containing table data and metadata
"""
# Calculate the total area of the page body to filter out tiny tables
body_area = self.page.evaluate("""
() => {
const body = document.body;
const rect = body.getBoundingClientRect();
return rect.width * rect.height;
}
""")
min_area = body_area * 0.02 # Tables must be at least 2% of page area
# Find all HTML table elements on the page
tables = self.page.locator('table').all()
self.tables_data = [] # Reset table data list
# Process each table found on the page
for i, table in enumerate(tables):
try:
# Skip hidden tables (display:none, visibility:hidden, etc.)
if not table.is_visible():
continue
# Extract table dimensions and structure information
# This helps us score tables to find the most relevant one
table_info = table.evaluate("""
element => {
const rect = element.getBoundingClientRect(); // Get table dimensions
const rows = element.querySelectorAll('tr').length; // Count table rows
const cols = element.querySelector('tr') ? // Count columns in first row
element.querySelector('tr').querySelectorAll('td, th').length : 0;
return {
width: rect.width,
height: rect.height,
rows: rows,
cols: cols,
area: rect.width * rect.height // Calculate total area
};
}
""")
# Skip tables that are too small (likely navigation or layout tables)
if table_info['area'] < min_area:
continue
# Calculate relevance score: larger tables with more rows score higher
# Formula: area * rows² gives preference to data-rich tables
score = table_info['area'] * (table_info['rows'] ** 2)
# Extract table data using appropriate method
if self.extract_nested_data:
# Enhanced extraction that captures links and images within cells
df = self.extract_table_with_nested_data(table)
df_valid = not df.empty
else:
# Standard extraction using pandas' built-in HTML parser
table_html = table.evaluate('element => element.outerHTML')
df_list = pd.read_html(StringIO(table_html)) # Parse HTML table
df = df_list[0] if df_list else pd.DataFrame()
df_valid = df_list and not df_list[0].empty
if df_valid:
# Clean up duplicate or sparse columns for better data quality
df = self.smart_column_deduplication(df)
# Store table data and metadata for later use
self.tables_data.append({
'index': i, # Original index on page
'rows': table_info['rows'], # Number of rows
'cols': len(df.columns), # Number of columns after cleaning
'score': score, # Relevance score
'area': table_info['area'], # Visual area on page
'df': df, # Pandas DataFrame with actual data
'element': table # Reference to DOM element
})
except Exception as e:
console.print(f"[yellow]Warning: Could not parse table {i}: {e}[/yellow]")
# Sort tables by relevance score (highest first) and keep only top 5
# This helps users focus on the most likely data tables
self.tables_data = sorted(self.tables_data, key=lambda x: x['score'], reverse=True)[:5]
return self.tables_data
def find_pattern_tables(self) -> List[Dict]:
"""Find div/list-based repeating patterns that look like tables
Many modern websites use divs with CSS instead of HTML tables.
This method detects these patterns and treats them as tables.
Returns:
List of dictionaries containing pattern-based table data
"""
pattern_tables = []
# Common CSS selectors that often contain tabular data
# These patterns are commonly used in modern web design
selectors = [
'div[class*="table"]', # Divs with "table" in class name
'div[class*="grid"]', # Grid layouts
'div[class*="list"]', # List containers
'div[class*="row"]:has(> div)', # Row-based layouts
'ul[class*="list"]', # Unordered lists with data
'.results > *', # Search result containers
'.items > *', # Item containers
'.products > *', # E-commerce product lists
'[role="table"]', # ARIA role for accessibility
'[role="grid"]' # ARIA grid role
]
# Try each selector to find pattern-based tables
for selector in selectors:
try:
# Get up to 5 containers matching each selector
containers = self.page.locator(selector).all()[:5]
for container in containers:
# Skip hidden containers
if not container.is_visible():
continue
# Analyze container for repeating structural patterns
# This JavaScript code runs in the browser to detect patterns
pattern_info = container.evaluate("""
element => {
const children = Array.from(element.children).filter(child =>
!['SCRIPT', 'STYLE', 'META'].includes(child.tagName)
);
if (children.length < 3) return null;
// Count how often each CSS class appears across children
// This helps identify repeating patterns
const classCount = {};
children.forEach(child => {
Array.from(child.classList).forEach(cls => {
classCount[cls] = (classCount[cls] || 0) + 1;
});
});
// Find classes that appear in at least 70% of children
// These are likely the repeating pattern classes
const commonClasses = Object.entries(classCount)
.filter(([cls, count]) => count >= children.length * 0.7)
.map(([cls]) => cls);
if (commonClasses.length === 0) return null;
const rect = element.getBoundingClientRect();
// Estimate the number of "columns" by counting text nodes
// in the first child element
const firstChild = children[0];
const textNodes = [];
// Create a tree walker to find all text nodes
const walker = document.createTreeWalker(
firstChild,
NodeFilter.SHOW_TEXT, // Only look for text nodes
null,
false
);
let node;
while (node = walker.nextNode()) {
if (node.textContent.trim()) {
textNodes.push(node.textContent.trim());
}
}
return {
childCount: children.length,
commonClass: commonClasses[0],
area: rect.width * rect.height,
width: rect.width,
height: rect.height,
estimatedCols: textNodes.length
};
}
""")
if pattern_info and pattern_info['childCount'] >= 3:
# Extract data from pattern
df = self.extract_pattern_data(container, pattern_info)
if df is not None and not df.empty:
score = pattern_info['area'] * (pattern_info['childCount'] ** 2)
pattern_tables.append({
'rows': pattern_info['childCount'],
'cols': pattern_info['estimatedCols'],
'score': score,
'area': pattern_info['area'],
'df': df,
'element': container,
'type': 'pattern',
'pattern_class': pattern_info['commonClass']
})
except Exception as e:
pass
return pattern_tables
def extract_pattern_data(self, container: Locator, pattern_info: Dict) -> Optional[pd.DataFrame]:
"""Extract data from pattern-based structure (div/list layouts)
Args:
container: The container element with repeating patterns
pattern_info: Information about the detected pattern
Returns:
DataFrame with extracted data or None if extraction fails
"""
try:
# Execute JavaScript to extract data from the pattern structure
data = container.evaluate("""
(element, patternClass) => {
const rows = [];
const children = Array.from(element.children).filter(child =>
child.classList.contains(patternClass) || element.children.length < 10
);
children.forEach(child => {
const row = {};
// Extract all text nodes
const texts = [];
const walker = document.createTreeWalker(
child,
NodeFilter.SHOW_TEXT,
null,
false
);
let node;
while (node = walker.nextNode()) {
const text = node.textContent.trim();
if (text && text.length > 0) {
texts.push(text);
}
}
// Extract links
const links = Array.from(child.querySelectorAll('a')).map(a => ({
text: a.textContent.trim(),
href: a.href
}));
// Build row data
texts.forEach((text, i) => {
row[`col_${i}`] = text;
});
// Add link data
links.forEach((link, i) => {
if (link.text && !texts.includes(link.text)) {
row[`link_${i}`] = link.text;
row[`link_${i}_href`] = link.href;
}
});
if (Object.keys(row).length > 0) {
rows.push(row);
}
});
return rows;
}
""", pattern_info.get('commonClass', ''))
if data and len(data) > 0:
return pd.DataFrame(data)
except Exception as e:
console.print(f"[yellow]Error extracting pattern data: {e}[/yellow]")
return None
def extract_table_with_nested_data(self, table_element: Locator) -> pd.DataFrame:
"""Extract table data including nested elements like links and images
This enhanced extraction captures not just text but also:
- Links (href and text)
- Images (src and alt text)
- Other nested elements
Args:
table_element: The table DOM element to extract from
Returns:
DataFrame with extracted data including nested elements
"""
# Execute JavaScript to extract comprehensive table data
table_data = table_element.evaluate("""
element => {
const rows = [];
const trs = element.querySelectorAll('tr');
trs.forEach(tr => {
const row = {};
const cells = tr.querySelectorAll('td, th');
cells.forEach((cell, index) => {
// Extract main text content from the cell
row[`col_${index}`] = cell.textContent.trim();
// Check for and extract links within the cell
const link = cell.querySelector('a');
if (link) {
row[`col_${index}_link`] = link.href; // Link URL
row[`col_${index}_link_text`] = link.textContent.trim(); // Link text
}
// Check for and extract images within the cell
const img = cell.querySelector('img');
if (img) {
row[`col_${index}_img`] = img.src; // Image URL
row[`col_${index}_img_alt`] = img.alt; // Alt text
}
});
if (Object.keys(row).length > 0) {
rows.push(row);
}
});
return rows;
}
""")
if table_data:
df = pd.DataFrame(table_data)
# Clean up column names by removing empty link/img columns
cols_to_keep = []
for col in df.columns:
if not (col.endswith('_link') or col.endswith('_img') or col.endswith('_link_text') or col.endswith('_img_alt')):
cols_to_keep.append(col)
elif df[col].notna().any(): # Keep only if has data
cols_to_keep.append(col)
return df[cols_to_keep]
return pd.DataFrame()
def clear_all_highlights(self):
"""Clear all visual highlighting from the page
Removes all CSS classes used for highlighting tables and rows.
This is called before applying new highlights.
"""
self.page.evaluate("""
() => {
// Remove all highlighting classes from all elements
document.querySelectorAll('*').forEach(el => {
el.classList.remove('tablescraper-selected-table'); // Remove table highlight
el.classList.remove('tablescraper-selected-row'); // Remove row highlight
el.classList.remove('tablescraper-hover'); // Remove hover effect
});
}
""")
def highlight_table(self, table_index: int):
"""Highlight a specific table on the page for visual identification
Args:
table_index: Index of the table in self.tables_data to highlight
"""
# First clear any existing highlights
self.clear_all_highlights()
# Apply highlighting to the selected table
if table_index < len(self.tables_data):
table_data = self.tables_data[table_index]
# Apply visual highlighting to the table element
table_data['element'].evaluate("""
element => {
// Add red border to table
element.classList.add('tablescraper-selected-table');
// Scroll table into view smoothly
element.scrollIntoView({behavior: 'smooth', block: 'center'});
// Highlight all rows within the table
const rows = element.querySelectorAll('tr');
rows.forEach(row => {
row.classList.add('tablescraper-selected-row'); // Yellow background
});
}
""")
# Enable hover effects on table rows
self.enable_hover_effects(table_data['element'])
def enable_hover_effects(self, table_element):
"""Enable row hover effects (port of extension's d function)"""
table_element.evaluate("""
element => {
// Remove any existing hover listeners
if (window.tableHoverListeners) {
window.tableHoverListeners.forEach(({el, fn}) => {
el.removeEventListener('mouseover', fn);
});
window.tableHoverListeners = [];
}
window.tableHoverListeners = [];
// Add hover effects to rows
const rows = element.querySelectorAll('tr');
rows.forEach(row => {
const hoverFn = function(e) {
// Remove hover from all elements
document.querySelectorAll('.tablescraper-hover').forEach(el => {
el.classList.remove('tablescraper-hover');
});
// Add hover to current row
this.classList.add('tablescraper-hover');
};
const leaveFn = function(e) {
this.classList.remove('tablescraper-hover');
};
row.addEventListener('mouseover', hoverFn);
row.addEventListener('mouseleave', leaveFn);
window.tableHoverListeners.push({el: row, fn: hoverFn});
window.tableHoverListeners.push({el: row, fn: leaveFn});
});
}
""")
def calculate_content_hash(self, df: pd.DataFrame) -> str:
"""Calculate SHA256 hash of dataframe content for duplicate detection
Creates a unique fingerprint of the data to detect when we've
scraped the same content (useful for pagination).
Args:
df: DataFrame to hash
Returns:
SHA256 hash string of the data content
"""
# Convert DataFrame to JSON format for hashing
content = df.to_json(orient='records')
# Sort the JSON to ensure consistent hashing regardless of column order
import json
data = json.loads(content)
sorted_content = json.dumps(data, sort_keys=True)
# Generate SHA256 hash of the sorted content
return hashlib.sha256(sorted_content.encode()).hexdigest()
def check_duplicate(self, df: pd.DataFrame) -> bool:
"""Check if this data has already been scraped
Used during pagination to detect when we've reached the end
or are seeing repeated content.
Args:
df: DataFrame to check
Returns:
True if this exact data has been seen before, False otherwise
"""
content_hash = self.calculate_content_hash(df)
# Check if we've seen this exact content before
if content_hash in self.scraped_hashes:
return True # Duplicate found
# Add to set of seen content
self.scraped_hashes.add(content_hash)
return False # New content
def smart_column_deduplication(self, df: pd.DataFrame) -> pd.DataFrame:
"""Apply smart column deduplication to clean up messy tables
This method:
1. Removes columns where all values are identical (no information)
2. Merges duplicate columns that don't overlap
3. Removes sparse columns with too little data
Args:
df: DataFrame to clean
Returns:
Cleaned DataFrame with deduplicated columns
"""
if df.empty:
return df
# Step 1: Remove columns where all values are identical (no useful information)
cols_to_keep = []
for col in df.columns:
unique_values = df[col].dropna().unique()
# Keep column if it has varying values or is completely empty
if len(unique_values) > 1 or len(unique_values) == 0:
cols_to_keep.append(col)
df = df[cols_to_keep]
# Step 2: Handle duplicate column names (often from poorly formatted HTML)
column_counts = {} # Track column name occurrences
new_columns = [] # Build new column names
for col in df.columns:
if col in column_counts:
# Check if duplicate columns can be merged (no overlapping data)
can_merge = True
for idx in df.index:
# If both columns have data at same row, can't merge
if pd.notna(df.loc[idx, col]) and pd.notna(df.iloc[idx, column_counts[col]]):
can_merge = False
break
if can_merge:
# Merge columns - keep existing column index
for idx in df.index:
if pd.notna(df.loc[idx, col]):
df.iloc[idx, column_counts[col]] = df.loc[idx, col]
# Skip this duplicate column
continue
else:
# Add numeric suffix
suffix_num = 2
new_col_name = f"{col} {suffix_num}"
while new_col_name in new_columns:
suffix_num += 1
new_col_name = f"{col} {suffix_num}"
new_columns.append(new_col_name)
else:
column_counts[col] = len(new_columns)
new_columns.append(col)
# Apply new column names
df = df.iloc[:, [i for i, col in enumerate(df.columns) if i < len(new_columns)]]
df.columns = new_columns[:len(df.columns)]
# Step 3: Remove sparse columns (less than 20% data)
# These are often formatting artifacts or empty columns
min_data_ratio = 0.2 # Require at least 20% non-null values
cols_to_keep = []
for col in df.columns:
# Calculate ratio of non-null values
non_null_ratio = df[col].notna().sum() / len(df)
if non_null_ratio >= min_data_ratio:
cols_to_keep.append(col)
return df[cols_to_keep] if cols_to_keep else df
def infer_data_types(self, df: pd.DataFrame) -> pd.DataFrame:
"""Infer and convert data types for better Excel/CSV export
Automatically detects:
- Numbers (including currency and percentages)
- Dates and timestamps
- Boolean values
Args:
df: DataFrame with string data
Returns:
DataFrame with properly typed columns
"""
df_typed = df.copy() # Work on a copy to avoid modifying original
for col in df_typed.columns:
# Skip columns that are already non-string
if df_typed[col].dtype != 'object':
continue
# Try to convert to numeric
try:
# Remove common number formatting characters (commas, dollar signs)
cleaned = df_typed[col].astype(str).str.replace(',', '').str.replace('$', '').str.strip()
# Check if column contains percentage values
if cleaned.str.endswith('%').any():
cleaned = cleaned.str.rstrip('%') # Remove % sign
numeric_vals = pd.to_numeric(cleaned, errors='coerce')
# Convert if at least 50% of values are valid percentages
if numeric_vals.notna().sum() > len(cleaned) * 0.5:
df_typed[col] = numeric_vals / 100 # Convert to decimal (50% -> 0.5)
continue
# Try regular numeric conversion
numeric_vals = pd.to_numeric(cleaned, errors='coerce')
if numeric_vals.notna().sum() > len(cleaned) * 0.8: # At least 80% valid
df_typed[col] = numeric_vals
continue
except:
pass
# Try to convert to datetime
try:
# Only attempt if values look like dates (check first 10 values)
sample = df_typed[col].dropna().astype(str).head(10)
# Common date patterns to check for
date_patterns = [r'\d{4}-\d{2}-\d{2}', # YYYY-MM-DD
r'\d{1,2}/\d{1,2}/\d{2,4}', # MM/DD/YYYY or M/D/YY
r'\d{1,2}-\d{1,2}-\d{2,4}', # MM-DD-YYYY
r'\w+ \d{1,2}, \d{4}'] # Month DD, YYYY
if any(sample.str.match(pattern).any() for pattern in date_patterns):
date_vals = pd.to_datetime(df_typed[col], errors='coerce')
if date_vals.notna().sum() > len(df_typed[col]) * 0.5:
df_typed[col] = date_vals
continue
except:
pass
# Check for boolean values
try:
lower_vals = df_typed[col].astype(str).str.lower().str.strip()
# Check if all values are boolean-like
if set(lower_vals.dropna().unique()) <= {'true', 'false', 'yes', 'no', '1', '0', ''}:
# Map string values to actual booleans
bool_map = {'true': True, 'false': False,
'yes': True, 'no': False,
'1': True, '0': False,
'': None} # Empty string becomes None
df_typed[col] = lower_vals.map(bool_map)
continue
except:
pass
return df_typed
def generate_smart_selector(self, element_js: str) -> str:
"""Generate robust CSS selector avoiding dynamic IDs
Creates a CSS selector that will reliably find the element even
if the page structure changes slightly. Avoids IDs with numbers
which are often dynamically generated.
Args:
element_js: JavaScript expression to get the element
Returns:
CSS selector string for the element
"""
selector = self.page.evaluate(f"""
(function() {{
const element = {element_js}; // Get the target element
if (!element) return '';
// Helper function to escape CSS special characters
// This ensures selectors work even with special characters in classes/IDs
function escapeCSS(str, prefix) {{
const escaped = (prefix || '.') + // Use # for IDs, . for classes
str.replace(/[!"#$%&'()*+,.\\/:;<=>?@[\\\\\\]^`{{|}}~]/g, '\\\\$&').trim();
return escaped;
}}
// Build selector from element to root, creating a path
const parts = [];
let current = element;
// Walk up the DOM tree from element to body
while (current && current.tagName &&
current.tagName.toLowerCase() !== 'html' &&
current.tagName.toLowerCase() !== 'body') {{
let selector = current.tagName.toLowerCase();
// Use ID if it exists and doesn't contain numbers
// (numbers often indicate dynamic IDs that change)
if (current.id && current.id.trim() && !current.id.match(/\\d+/)) {{
selector += escapeCSS(current.id, '#');
}}
// Otherwise use CSS classes
else if (current.className && current.className.trim()) {{
// Handle className that might be an object (for SVG elements)
const classStr = typeof current.className === 'string' ?
current.className : current.className.baseVal || '';
if (classStr) {{
// Join multiple classes with dots
selector += escapeCSS(classStr).replace(/\\s+/g, '.');
}}
}}
parts.unshift(selector);
current = current.parentElement;
}}
return parts.join(' > ');
}})();
""")
return selector
def load_domain_configs(self) -> Dict[str, Dict]:
"""Load domain-specific configurations from disk
Loads saved configurations like next button selectors for each domain,
mimicking the Chrome extension's localStorage.
Returns:
Dictionary of domain configurations
"""
if DOMAIN_CONFIG_FILE.exists():
try:
with open(DOMAIN_CONFIG_FILE, 'r') as f:
return json.load(f)
except:
# Return empty dict if file is corrupted
return {}
return {} # No config file exists yet
def save_domain_configs(self):
"""Save domain configurations to disk
Persists configurations so they can be reused in future sessions.
"""
# Create directory if it doesn't exist
DOMAIN_CONFIG_FILE.parent.mkdir(parents=True, exist_ok=True)
# Write configurations as formatted JSON
with open(DOMAIN_CONFIG_FILE, 'w') as f:
json.dump(self.domain_configs, f, indent=2)
def get_domain_config(self, url: str) -> Dict[str, Any]:
"""Get configuration for a specific domain
Args:
url: URL to extract domain from
Returns:
Configuration dictionary for the domain
"""
# Extract domain from URL
parsed = urlparse(url)
hostname = parsed.hostname or parsed.netloc
self.current_domain = hostname
# Create default config if domain is new
if hostname not in self.domain_configs:
self.domain_configs[hostname] = {
'nextSelector': None, # CSS selector for "next" button
'tableSelector': None, # CSS selector for table
'crawlDelay': 1000, # Delay between pages in milliseconds
'maxWait': 20000, # Maximum wait time for page load
'deletedFields': [], # Columns to exclude
'customHeaders': {}, # Custom HTTP headers
'lastUsed': datetime.now().isoformat() # Last access time
}
return self.domain_configs[hostname]
def save_next_selector(self, selector: str):
"""Save next button selector for current domain
Saves the selector so it can be reused next time this domain is scraped.
Args:
selector: CSS selector for the "next" button
"""
if self.current_domain:
# Update configuration
self.domain_configs[self.current_domain]['nextSelector'] = selector
self.domain_configs[self.current_domain]['lastUsed'] = datetime.now().isoformat()
# Persist to disk
self.save_domain_configs()
console.print(f"[green]β Saved selector for {self.current_domain}[/green]")
def save_table_selector(self, selector: str):
"""Save table selector for current domain"""
if self.current_domain:
self.domain_configs[self.current_domain]['tableSelector'] = selector
self.save_domain_configs()
def save_deleted_fields(self, fields: List[str]):
"""Save deleted column names for current domain"""
if self.current_domain:
self.domain_configs[self.current_domain]['deletedFields'] = fields
self.save_domain_configs()
def find_scrollable_parent(self, element_selector: str) -> Optional[str]:
"""Find the scrollable parent element (port of extension's scroll detection)"""
scrollable_parent = self.page.evaluate(f"""
async () => {{
// Helper to check if element can scroll
const canScroll = async (element) => {{
const originalTop = element.scrollTop;
element.scrollTop = element.scrollTop + 50;
await new Promise(resolve => setTimeout(resolve, 10));
const scrolled = element.scrollTop > originalTop;
element.scrollTop = originalTop;
return scrolled;
}};
let element = document.querySelector('{element_selector}');
// Walk up the DOM tree to find scrollable parent
while (element && element !== document.body) {{
if (await canScroll(element)) {{
// Generate selector for this element
return element.id ? '#' + element.id :
element.className ? '.' + element.className.split(' ')[0] :
element.tagName.toLowerCase();
}}
element = element.parentElement;
}}
// Check body as last resort
if (await canScroll(document.body)) {{
return 'body';
}}
return null;
}}
""")
return scrollable_parent
def lazy_load_scroll(self, container_selector: str, target_selector: str, max_time: int = 30):
"""Implement lazy loading by scrolling (port of extension's N function)"""
console.print(f"[bold yellow]Performing lazy load scrolling...[/bold]")
result = self.page.evaluate(f"""
async () => {{
const container = document.querySelector('{container_selector}');
if (!container) return {{ error: 'Container not found' }};
const targetElements = () => document.querySelectorAll('{target_selector}');
const initialCount = targetElements().length;
let previousCount = initialCount;
let previousScrollTop = container.scrollTop;
let noChangeCount = 0;
const startTime = Date.now();
const maxTime = {max_time} * 1000;
console.log('Starting lazy load scroll. Initial elements:', initialCount);
while (Date.now() - startTime < maxTime) {{
// Scroll down by 1000 pixels
container.scrollTop += 1000;
// Wait for content to load
await new Promise(resolve => setTimeout(resolve, 1000));
const currentCount = targetElements().length;
const currentScrollTop = container.scrollTop;
// Check if new content loaded
if (currentCount > previousCount) {{
console.log('New elements loaded:', currentCount - previousCount);
previousCount = currentCount;
noChangeCount = 0;
}} else if (currentScrollTop === previousScrollTop) {{
// We've reached the bottom
noChangeCount++;
if (noChangeCount >= 3) {{
console.log('Reached bottom of scrollable area');
break;
}}
}}
previousScrollTop = currentScrollTop;
}}
const finalCount = targetElements().length;
return {{
success: true,
initialCount: initialCount,
finalCount: finalCount,
newElements: finalCount - initialCount
}};
}}
""")
if result.get('error'):
console.print(f"[red]Error: {result['error']}[/red]")
else:
console.print(f"[green]β Loaded {result['newElements']} new elements[/green]")
return result
def wait_for_network_idle(self, max_wait: int = 20000, crawl_delay: int = 1000) -> bool:
"""Wait for network requests to complete
Ensures the page is fully loaded before scraping, especially important
for pages that load data via AJAX.
Args:
max_wait: Maximum time to wait in milliseconds
crawl_delay: Additional delay after network idle in milliseconds
Returns:
True if network became idle, False if timeout
"""
console.print("[dim]Waiting for network activity to complete...[/dim]")
try:
# Wait for network to be idle (no requests for 500ms)
self.page.wait_for_load_state("networkidle", timeout=max_wait)
# Additional wait to ensure dynamic content is rendered
time.sleep(crawl_delay / 1000)
return True
except Exception as e:
console.print(f"[yellow]Network wait timeout: {e}[/yellow]")
return False
def smart_page_wait(self, action_fn=None, max_wait: int = 20000, crawl_delay: int = 1000):
"""Smart page waiting with network monitoring
Intelligently waits for page to be ready after an action (like clicking next).
Combines DOM readiness, network idle, and safety delays.
Args:
action_fn: Optional function to execute (e.g., click next button)
max_wait: Maximum wait time in milliseconds
crawl_delay: Additional safety delay in milliseconds
Returns:
True if page settled successfully
"""
# Step 1: Execute action if provided (e.g., click next button)
if action_fn:
action_fn()
# Step 2: Wait for DOM to be ready
try:
self.page.wait_for_load_state("domcontentloaded", timeout=max_wait // 4)
except:
pass # Continue even if timeout
# Step 3: Wait for network requests to complete
network_settled = self.wait_for_network_idle(max_wait, crawl_delay)
# Step 4: Additional safety wait for JavaScript rendering
time.sleep(crawl_delay / 1000)
return network_settled
def row_by_row_scroll(self, table_selector: str):
"""Scroll through table rows one by one (port of extension's E function)"""
self.page.evaluate(f"""
() => {{
const table = document.querySelector('{table_selector}');
if (!table) return;
const rows = table.querySelectorAll('tr');
let delay = 50; // Default 50ms between rows
// Adjust delay based on row count (max 3 seconds total)
if (rows.length * delay > 3000) {{
delay = Math.max(10, 3000 / rows.length);
}}
console.log('Scrolling through', rows.length, 'rows with', delay, 'ms delay');
let index = 0;
const scrollInterval = setInterval(() => {{
if (index >= rows.length || index * delay > 10000) {{
clearInterval(scrollInterval);
return;
}}
rows[index].scrollIntoView({{ behavior: 'smooth', block: 'center' }});
index++;
}}, delay);
}}
""")
def display_table_preview(self, df: pd.DataFrame, max_rows: int = 10):
"""Display a preview of the dataframe in the terminal
Args:
df: DataFrame to display
max_rows: Maximum number of rows to show
"""
# Create a Rich table for beautiful terminal output
rich_table = RichTable(title="Table Preview", show_lines=True)
# Add column headers
for col in df.columns:
rich_table.add_column(str(col), style="cyan", no_wrap=False)
# Add data rows (limited to max_rows)
for _, row in df.head(max_rows).iterrows():
# Convert all values to strings for display
rich_table.add_row(*[str(val) for val in row.values])
console.print(rich_table)
if len(df) > max_rows:
console.print(f"[dim]... and {len(df) - max_rows} more rows[/dim]")
def select_table(self):
"""Let user select which table to scrape
In interactive mode, allows cycling through found tables.
In non-interactive mode, auto-selects the highest scoring table.
Returns:
Index of selected table or None if cancelled
"""
if not self.tables_data:
console.print("[red]No tables found on this page![/red]")
return None
# In non-interactive mode (piped input), auto-select the best table
if not sys.stdin.isatty():
current = self.tables_data[0] # Highest scoring table
# Show what type of table was found
table_type = "π HTML Table" if current.get('type') != 'pattern' else "π Pattern-based"
console.print(f"\n[bold]Auto-selected: {table_type}[/bold]")
console.print(f"Rows: {current['rows']}, Columns: {current['cols']}")
if 'score' in current:
console.print(f"Score: {current['score']:,.0f}")
# Highlight the table on the page
self.highlight_table(0)
# Show preview of the data
self.display_table_preview(current['df'])
return 0
# Interactive mode - let user cycle through tables
while True:
# Get current table being previewed
current = self.tables_data[self.current_table_index]
# Display table information
table_type = "π HTML Table" if current.get('type') != 'pattern' else "π Pattern-based"
console.print(f"\n[bold]{table_type} {self.current_table_index + 1} of {len(self.tables_data)}[/bold]")
console.print(f"Rows: {current['rows']}, Columns: {current['cols']}")
if 'score' in current:
console.print(f"Score: {current['score']:,.0f} (larger tables with more data score higher)")
self.highlight_table(self.current_table_index)
self.display_table_preview(current['df'])
# Ask user what to do
console.print("\n[bold]Options:[/bold]")
console.print("1. Use this table")
console.print("2. Try another table")
console.print("3. Cancel")
choice = Prompt.ask("Select", choices=["1", "2", "3"])
if choice == "1":
return self.current_table_index
elif choice == "2":
self.current_table_index = (self.current_table_index + 1) % len(self.tables_data)
else:
return None
def locate_next_button(self):
"""Interactive next button selection
Allows user to click on the "next" button in the browser,
which will be captured and saved for pagination.
Returns:
True if button was successfully located, False otherwise
"""
console.print("\n[bold yellow]Click on the 'Next' button or pagination link in the browser[/bold]")
console.print("[dim]The button will be highlighted in green when clicked[/dim]")
# Inject JavaScript to intercept clicks and identify the next button
self.page.evaluate("""
window.tdsNextButton = null; // Store the clicked element
window.tdsClickHandler = function(e) {
e.preventDefault(); // Prevent default action
e.stopPropagation(); // Stop event bubbling
// Remove any previous highlight
document.querySelectorAll('.tds-next-button').forEach(el => {
el.classList.remove('tds-next-button');
});
// Highlight the clicked element in green
e.target.classList.add('tds-next-button');
window.tdsNextButton = e.target; // Store reference
// Generate a basic selector for the clicked element
let selector = '';
if (e.target.id) {
selector = '#' + e.target.id; // Use ID if available
} else if (e.target.className) {
// Use class names (excluding our highlight class)
selector = '.' + e.target.className.split(' ').filter(c => c !== 'tds-next-button').join('.');
} else {
// Fall back to tag name and text content
selector = e.target.tagName.toLowerCase();
if (e.target.textContent) {
selector += ':has-text("' + e.target.textContent.trim() + '")';
}
}
window.tdsNextSelector = selector; // Store basic selector
window.tdsNextElement = e.target; // Store element reference
return false;
};
document.addEventListener('click', window.tdsClickHandler, true);
""")
# Wait for user to click on the next button (30 second timeout)
start_time = time.time()
while time.time() - start_time < 30:
# Check if user has clicked something
next_button = self.page.evaluate("window.tdsNextButton")
if next_button:
# Generate a robust selector that will work across pages
smart_selector = self.generate_smart_selector("window.tdsNextElement")
if smart_selector:
self.next_button_selector = smart_selector
else:
# Fall back to the basic selector if smart generation fails
self.next_button_selector = self.page.evaluate("window.tdsNextSelector")
# Remove event listener
self.page.evaluate("""
document.removeEventListener('click', window.tdsClickHandler, true);
""")
console.print(f"\n[green]β Next button located: {self.next_button_selector}[/green]")
# Save selector for future use
self.save_next_selector(self.next_button_selector)
return True
time.sleep(0.1)
# Timeout
self.page.evaluate("""
document.removeEventListener('click', window.tdsClickHandler, true);
""")
console.print("[red]Timeout waiting for next button selection[/red]")
return False
def crawl_pages(self, max_pages: int = 10, delay: int = 1):
"""Crawl multiple pages using pagination
Args:
max_pages: Maximum number of pages to crawl
delay: Delay between pages in seconds
Returns:
Tuple of (combined DataFrame, number of pages scraped)
"""
pages_scraped = 1 # Start with current page
# Get initial data from selected table
df = self.tables_data[self.current_table_index]['df']
all_data = df.to_dict('records') # Convert to list of dicts
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
console=console
) as progress:
task = progress.add_task("Crawling pages...", total=max_pages)
while pages_scraped < max_pages:
try:
# Get domain-specific timing configuration
domain_config = self.domain_configs.get(self.current_domain, {})
max_wait = domain_config.get('maxWait', 20000) # Max wait time
crawl_delay = domain_config.get('crawlDelay', delay * 1000) # Delay between pages
# Define function to click the next button
def click_next():
self.page.click(self.next_button_selector)
# Click next and wait intelligently for page to load
self.smart_page_wait(click_next, max_wait, crawl_delay)
# Find tables on the newly loaded page
tables = self.page.locator('table').all()
if not tables:
console.print("[yellow]No tables found on new page[/yellow]")
break
# Try to find the same table position, or use first available
table_index = min(self.current_table_index, len(tables) - 1)
# Extract HTML of the target table
table_html = tables[table_index].evaluate('element => element.outerHTML')
# Parse the table HTML into a DataFrame
df_list = pd.read_html(StringIO(table_html))
if df_list and not df_list[0].empty:
# Check if this is duplicate content (end of pagination)
if self.check_duplicate(df_list[0]):
console.print("[yellow]β οΈ Duplicate content detected, stopping crawl[/yellow]")
break
# Add new rows to our collection
new_rows = df_list[0].to_dict('records')
all_data.extend(new_rows)
pages_scraped += 1
progress.update(task, advance=1,
description=f"Scraped {pages_scraped} pages, {len(all_data)} rows")
else:
console.print("[yellow]No data found on new page[/yellow]")
break
except Exception as e:
console.print(f"[red]Error: {e}[/red]")
break
return pd.DataFrame(all_data), pages_scraped
def save_data(self, df: pd.DataFrame):
"""Save scraped data with intelligent type preservation
Offers multiple export formats and can automatically detect
and preserve data types.
Args:
df: DataFrame to save
"""
console.print("\n[bold]Export Options:[/bold]")
console.print("1. CSV") # Standard comma-separated values
console.print("2. Excel (XLSX) - with data types preserved") # Excel with formatting
console.print("3. JSON") # JavaScript Object Notation
console.print("4. Copy to clipboard") # Direct copy for pasting
choice = Prompt.ask("Select format", choices=["1", "2", "3", "4"])
# Ask if user wants automatic type detection
preserve_types = Confirm.ask("Detect and preserve data types (numbers, dates, etc)?", default=True)
if preserve_types:
console.print("[dim]Detecting data types...[/dim]")
# Apply intelligent type inference
df_typed = self.infer_data_types(df)
# Show user what data types were detected
type_changes = []
for col in df.columns:
# Check if column type changed
if df[col].dtype != df_typed[col].dtype:
type_changes.append(f"{col}: {df[col].dtype} β {df_typed[col].dtype}")
if type_changes:
console.print("[green]Detected types:[/green]")
for change in type_changes[:5]: # Show first 5
console.print(f" {change}")
if len(type_changes) > 5:
console.print(f" ... and {len(type_changes) - 5} more")
df = df_typed
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
if choice == "1":
# Export as CSV
filename = f"scraped_data_{timestamp}.csv"
df.to_csv(filename, index=False) # Don't include row indices
console.print(f"[green]β Saved to {filename}[/green]")
elif choice == "2":
# Export as Excel with formatting
filename = f"scraped_data_{timestamp}.xlsx"
# Use Excel writer with openpyxl engine for formatting support
with pd.ExcelWriter(filename, engine='openpyxl') as writer:
df.to_excel(writer, index=False, sheet_name='Data')
# Auto-adjust column widths for better readability
worksheet = writer.sheets['Data']
for idx, col in enumerate(df.columns):
# Calculate optimal width based on content
max_length = max(
df[col].astype(str).map(len).max(), # Max data length
len(str(col)) # Column header length
) + 2 # Add padding
# Set width (max 50 to prevent too wide columns)
worksheet.column_dimensions[chr(65 + idx)].width = min(max_length, 50)
console.print(f"[green]β Saved to {filename} with formatting[/green]")
elif choice == "3":
filename = f"scraped_data_{timestamp}.json"
# JSON with date handling
df.to_json(filename, orient='records', indent=2, date_format='iso')
console.print(f"[green]β Saved to {filename}[/green]")
elif choice == "4":
df.to_clipboard(index=False, sep='\t')
console.print("[green]β Copied to clipboard[/green]")
def run(self, url: str, headless_override: Optional[bool] = None):
"""Main execution flow for interactive scraping
Args:
url: URL to scrape
headless_override: Force headless mode if specified
"""
try:
# Determine browser mode (headless or visible)
if headless_override is not None:
headless = headless_override
console.print(f"[dim]Running in {'headless' if headless else 'headed'} mode (from command line)[/dim]")
else:
# Ask user preference
headless = Confirm.ask("Run browser in headless mode?", default=True)
# Launch browser and navigate to URL
self.launch_browser(url, headless)
# Ask about advanced features (only in interactive mode)
if sys.stdin.isatty():
self.extract_nested_data = Confirm.ask("Extract links and images from table cells?", default=False)
else:
# Use command line flags or defaults in non-interactive mode
console.print(f"[dim]Using default settings (extract nested: {self.extract_nested_data})[/dim]")
# Find tables
console.print("\n[bold]Searching for tables...[/bold]")
tables = self.find_tables()
# Also find pattern-based tables if enabled
if self.enable_pattern_detection:
console.print("[dim]Also checking for div/list-based tables...[/dim]")
pattern_tables = self.find_pattern_tables()
if pattern_tables:
tables.extend(pattern_tables)
# Re-sort all by score
self.tables_data = sorted(tables, key=lambda x: x['score'], reverse=True)[:5]
if not self.tables_data:
console.print("[red]No tables found on this page![/red]")
return
console.print(f"[green]Found {len(self.tables_data)} table candidate(s)[/green]")
# Show what types were found
html_tables = sum(1 for t in self.tables_data if t.get('type') != 'pattern')
pattern_tables = sum(1 for t in self.tables_data if t.get('type') == 'pattern')
if pattern_tables > 0:
console.print(f"[dim]({html_tables} HTML tables, {pattern_tables} pattern-based)[/dim]")
# Select table
table_index = self.select_table()
if table_index is None:
return
# Ask if user wants to scrape multiple pages
if Confirm.ask("\nDo you want to crawl multiple pages?", default=False):
# Determine crawling method: infinite scroll or pagination
use_infinite_scroll = Confirm.ask("Use infinite scroll instead of pagination?", default=False)
if use_infinite_scroll:
# Infinite scroll mode - for pages that load more data when scrolling
# Generate a selector for the current table (it still carries the highlight class from selection)
table_selector = self.generate_smart_selector("document.querySelector('.tablescraper-selected-table')")
# Store the selector in the browser for later use
self.page.evaluate("sel => { window._tableSelector = sel; }", table_selector)
# Try to find the scrollable container element
scrollable = self.page.evaluate("""
async () => {
const table = document.querySelector(window._tableSelector);
if (!table) return null;
// Common CSS patterns for scrollable containers
const patterns = [
'.results-container', '.scroll-container', '.data-container',
'[class*="scroll"]', '[class*="results"]', 'main', 'body'
];
for (const pattern of patterns) {
const container = table.closest(pattern);
if (container) return pattern;
}
return 'body';
}
""")
if scrollable:
# Perform lazy loading
self.lazy_load_scroll(scrollable, table_selector)
# Re-find tables after scrolling
self.find_tables()
if self.tables_data:
df = self.tables_data[0]['df'] # Use the first/largest table
else:
df = self.tables_data[table_index]['df']
else:
console.print("[yellow]Could not find scrollable container[/yellow]")
df = self.tables_data[table_index]['df']
else:
# Traditional pagination mode - clicking "next" button
need_to_locate = True # Flag to determine if we need to find the button
# Check if we have a previously saved "next" button selector for this domain
if self.next_button_selector:
use_saved = Confirm.ask(f"Use saved 'Next' selector for {self.current_domain}?", default=True)
if use_saved:
need_to_locate = False # Don't need to locate, use saved
console.print(f"[green]Using saved selector: {self.next_button_selector}[/green]")
if need_to_locate and self.locate_next_button():
max_pages = int(Prompt.ask("Maximum pages to crawl", default="10"))
delay = int(Prompt.ask("Delay between pages (seconds)", default="1"))
elif not need_to_locate and self.next_button_selector:
max_pages = int(Prompt.ask("Maximum pages to crawl", default="10"))
delay = int(Prompt.ask("Delay between pages (seconds)", default="1"))
df, pages = self.crawl_pages(max_pages, delay)
console.print(f"\n[green]β Crawled {pages} pages, collected {len(df)} rows[/green]")
else:
df = self.tables_data[table_index]['df']
else:
df = self.tables_data[table_index]['df']
# Display final data
console.print("\n[bold]Final Data Preview:[/bold]")
self.display_table_preview(df, max_rows=20)
# Save data
if Confirm.ask("\nSave the data?", default=True):
self.save_data(df)
except KeyboardInterrupt:
console.print("\n[yellow]Interrupted by user[/yellow]")
except Exception as e:
console.print(f"\n[red]Error: {e}[/red]")
finally:
if self.browser:
self.browser.close()
def run_automated(self, url: str, headless: bool = True):
"""Automated run for command line usage with minimal interaction
This method is designed for non-interactive use cases where
the scraper automatically selects the best table and saves it.
Args:
url: URL to scrape
headless: Whether to run browser in headless mode
"""
try:
# Launch browser in specified mode
self.launch_browser(url, headless)
# Find tables with intelligent detection
console.print("\n[bold]Searching for tables...[/bold]")
tables = self.find_tables()
# Also find pattern-based tables if enabled
if self.enable_pattern_detection:
console.print("[dim]Also checking for div/list-based tables...[/dim]")
pattern_tables = self.find_pattern_tables()
if pattern_tables:
tables.extend(pattern_tables)
# Re-sort all by score
self.tables_data = sorted(tables, key=lambda x: x['score'], reverse=True)[:5]
if not self.tables_data:
console.print("[red]No tables found on this page![/red]")
return
console.print(f"[green]Found {len(self.tables_data)} table candidate(s)[/green]")
# Show what types were found
html_tables = sum(1 for t in self.tables_data if t.get('type') != 'pattern')
pattern_tables = sum(1 for t in self.tables_data if t.get('type') == 'pattern')
if pattern_tables > 0:
console.print(f"[dim]({html_tables} HTML tables, {pattern_tables} pattern-based)[/dim]")
# Auto-select the highest scoring table
table_index = 0
candidate = self.tables_data[table_index]
table_type = "π HTML Table" if candidate.get('type') != 'pattern' else "π Pattern-based"
console.print(f"\n[bold]Auto-selected: {table_type}[/bold]")
console.print(f"Rows: {candidate['rows']}, Columns: {candidate['cols']}")
if 'score' in candidate:
console.print(f"Score: {candidate['score']:,.0f}")
self.highlight_table(table_index)
# Extract data
df = candidate['df']
# Display preview
console.print("\n[bold]Data Preview:[/bold]")
self.display_table_preview(df, max_rows=10)
# Auto-save as CSV
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"scraped_data_{timestamp}.csv"
df.to_csv(filename, index=False)
console.print(f"\n[green]β Auto-saved to {filename}[/green]")
console.print(f"[dim]Scraped {len(df)} rows Γ {len(df.columns)} columns[/dim]")
except KeyboardInterrupt:
console.print("\n[yellow]Interrupted by user[/yellow]")
except Exception as e:
console.print(f"\n[red]Error: {e}[/red]")
finally:
if self.browser:
self.browser.close()
def main():
"""Main entry point for the Table Scraper CLI"""
# Set up command line argument parser
parser = argparse.ArgumentParser(description='TableScraper - Python Edition')
parser.add_argument('url', nargs='?', help='URL to scrape') # Optional URL argument
parser.add_argument('--headless', action='store_true', help='Run browser in headless mode') # Force headless
parser.add_argument('--extract-nested', action='store_true', help='Extract links and images from cells') # Enhanced extraction
parser.add_argument('--no-patterns', action='store_true', help='Disable pattern detection') # HTML tables only
args = parser.parse_args()
# Display welcome banner
console.print(Panel.fit(
"[bold blue]TableScraper - Python Edition[/bold blue]\n"
"A Playwright-powered table scraper",
border_style="blue"
))
# Get URL from command line or prompt user
if args.url:
# URL provided as command line argument
url = args.url
console.print(f"\n[green]Using URL from command line: {url}[/green]")
else:
# No URL provided, need to get it
if sys.stdin.isatty(): # Check if running interactively
# Prompt user for URL with a default example
url = Prompt.ask("\nEnter URL to scrape",
default="https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue")
else:
# Non-interactive mode without URL - show error
console.print("\n[red]No URL provided. Use: python table_scraper_cli.py [/red]")
sys.exit(1)
# Create scraper instance and apply command line options
scraper = TableScraperCLI()
if args.no_patterns:
# Disable detection of div/list-based tables
scraper.enable_pattern_detection = False
if args.extract_nested:
# Enable extraction of links and images from table cells
scraper.extract_nested_data = True
# Run the scraper
try:
# Only force headless mode if --headless flag was used
headless_override = args.headless if args.headless else None
# Execute main scraping workflow
scraper.run(url, headless_override=headless_override)
except KeyboardInterrupt:
# Handle Ctrl+C gracefully
console.print("\n[yellow]Interrupted by user[/yellow]")
except Exception as e:
# Display any other errors
console.print(f"\n[red]Error: {e}[/red]")
# Entry point when script is run directly
if __name__ == "__main__":
main() # Execute main function
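Save the script as table_scraper_cli.py (the filename its own usage message assumes) and run it from your terminal. For example, using the Wikipedia page the script suggests as its default:
# Interactive run: prompts for options and lets you pick a table visually
python table_scraper_cli.py "https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue"

# Fully automated run: headless browser, links/images extracted, HTML tables only
python table_scraper_cli.py "https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue" --headless --extract-nested --no-patterns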
For more complex scenarios, use BeautifulSoup for custom parsing, Selenium for JavaScript-heavy sites, or requests-html for basic dynamic content. See the full guide below.
Method 1: Using BeautifulSoup
BeautifulSoup is the most popular library for web scraping in Python. It provides a simple and intuitive API for parsing HTML and extracting data.
Basic Table Scraping
Let's start with a simple example of scraping a table from a webpage:
# Import required libraries for web scraping and data manipulation
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Send a GET request to the webpage containing the table
url = "https://example.com/table-page"
response = requests.get(url)
# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')
# Find the first table element on the page
table = soup.find('table')
# Extract column headers from the table
headers = []
for th in table.find_all('th'):  # Find all header cells
    headers.append(th.text.strip())  # Get text and remove whitespace
# Extract data rows from the table
rows = []
for tr in table.find_all('tr')[1:]:  # Skip the first row (headers)
    row = []
    for td in tr.find_all('td'):  # Find all data cells in the row
        row.append(td.text.strip())  # Get text and remove whitespace
    if row:  # Only append non-empty rows to avoid blank entries
        rows.append(row)
# Create a pandas DataFrame from the extracted data
df = pd.DataFrame(rows, columns=headers)
print(df) # Display the scraped table data
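In practice, it's worth guarding against failed requests and pages without a table before you start parsing. Here is a small defensive variation on the snippet above (the URL is still just a placeholder):
# Defensive version of the request: fail early on HTTP errors and missing tables
import requests
from bs4 import BeautifulSoup

url = "https://example.com/table-page"
response = requests.get(url, timeout=10)
response.raise_for_status()  # Raise an exception for 4xx/5xx responses

soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find('table')
if table is None:
    raise ValueError("No <table> element found on the page")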
Handling Multiple Tables
When a page contains multiple tables, you can select specific tables using various methods:
# Method 1: Select table by index position
tables = soup.find_all('table') # Get all tables on the page
second_table = tables[1] # Get the second table (0-indexed)
# Method 2: Select table by CSS class name
table = soup.find('table', {'class': 'data-table'})
# Method 3: Select table by unique ID attribute
table = soup.find('table', {'id': 'financial-data'})
# Method 4: Use CSS selector for nested elements
table = soup.select_one('div.container table.results')
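However you select the table, you can hand the matched element straight to pandas instead of walking its rows manually. A small helper along these lines (reusing the soup object from the snippets above) bridges the two approaches:
# Convert any BeautifulSoup table element into a DataFrame via pandas
from io import StringIO
import pandas as pd

def table_to_dataframe(table_tag):
    """Parse a bs4 <table> Tag into a pandas DataFrame."""
    return pd.read_html(StringIO(str(table_tag)))[0]

df = table_to_dataframe(soup.find('table', {'id': 'financial-data'}))
print(df.head())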
Method 2: Using pandas read_html()
For simple table extraction, pandas provides a convenient read_html() function that can automatically detect and parse tables from HTML:
# Import pandas library for easy table reading
import pandas as pd
# Read all tables directly from a URL (automatically detects tables)
url = "https://example.com/table-page"
tables = pd.read_html(url) # Returns a list of DataFrames
# Display information about the tables found
print(f"Found {len(tables)} tables")
df = tables[0] # Select the first table from the list
print(df.head()) # Show the first 5 rows
# Alternative: Pass HTML content directly (wrap the string in StringIO for pandas 2.1+)
from io import StringIO
html_content = "<table><tr><th>Name</th><th>Age</th></tr><tr><td>John</td><td>25</td></tr></table>"
df = pd.read_html(StringIO(html_content))[0]  # Parse the HTML string and get the first table
print(df)  # Display the resulting DataFrame
Advanced Options with read_html()
The read_html() function offers several parameters for more control:
# Filter tables by HTML attributes (only get tables with specific class)
df = pd.read_html(url, attrs={'class': 'wikitable'})[0]
# Use the first column as the DataFrame index
df = pd.read_html(url, index_col=0)[0]
# Skip the first 2 rows of the table (useful for headers)
df = pd.read_html(url, skiprows=2)[0]
# Automatically parse date columns into datetime objects
df = pd.read_html(url, parse_dates=True)[0]
# Handle numbers with thousands separators (e.g., "1,000")
df = pd.read_html(url, thousands=',')[0]
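Another handy option when a page contains dozens of tables is match, which returns only the tables whose text matches a given string or regular expression:
# Only return tables that contain the word "Revenue" somewhere in their text
df = pd.read_html(url, match='Revenue')[0]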
Method 3: Using requests-html
The requests-html library combines the simplicity of requests with JavaScript support, making it ideal for modern web pages:
# Import libraries for JavaScript-enabled web scraping
from requests_html import HTMLSession
import pandas as pd
# Create a new HTML session for making requests
session = HTMLSession()
# Get the webpage and render any JavaScript content
r = session.get('https://example.com/dynamic-table')
r.html.render() # Execute JavaScript to load dynamic content
# Find all table elements on the rendered page
tables = r.html.find('table')
# Extract data from the first table only
data = []
for table in tables[:1]:  # Process only the first table
    # Extract column headers from the table
    headers = [th.text for th in table.find('th')]
    # Extract data from each row (skip the header row)
    for tr in table.find('tr')[1:]:
        row = [td.text for td in tr.find('td')]  # Get text from each cell
        if row:  # Only add non-empty rows
            data.append(row)
# Create a pandas DataFrame from the extracted data
df = pd.DataFrame(data, columns=headers)
print(df) # Display the scraped table
Handling Dynamic Tables with Selenium
For tables loaded dynamically via JavaScript or requiring user interaction, Selenium is the go-to solution:
# Import Selenium components for browser automation
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
# Initialize Chrome browser driver
driver = webdriver.Chrome()
try:
    # Navigate to the webpage containing the dynamic table
    driver.get("https://example.com/dynamic-table")
    # Wait up to 10 seconds for the table to load
    wait = WebDriverWait(driver, 10)
    table = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "table.data-table"))
    )
    # Extract column headers from the table
    headers = [th.text for th in table.find_elements(By.TAG_NAME, "th")]
    # Extract data rows from the table
    rows = []
    for tr in table.find_elements(By.TAG_NAME, "tr")[1:]:  # Skip the header row
        row = [td.text for td in tr.find_elements(By.TAG_NAME, "td")]
        if row:  # Only add non-empty rows
            rows.append(row)
    # Create a pandas DataFrame from the scraped data
    df = pd.DataFrame(rows, columns=headers)
    print(df)  # Display the final table
finally:
    # Always close the browser to free up resources
    driver.quit()
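If you don't need to watch the browser while it works, you can run Chrome headless. The sketch below shows the options I'd typically pass; it assumes Selenium 4 and a recent Chrome build that supports the new headless mode.
# Sketch: launch Chrome without a visible window (assumes Selenium 4 and a recent Chrome)
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--headless=new")        # new headless mode in recent Chrome versions
options.add_argument("--window-size=1920,1080")  # give the page a realistic viewport
driver = webdriver.Chrome(options=options)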
Handling Pagination
Many tables are paginated. Continuing the Selenium example above, here's how to collect rows from every page:
# Imports this snippet needs beyond the previous example
import time
from selenium.common.exceptions import NoSuchElementException

# Initialize a list to store data from all pages
all_data = []
# Navigate to the first page of the paginated table
driver.get("https://example.com/paginated-table")
# Loop through all pages until no more pages are available
while True:
    # Wait for the table to load on the current page
    table = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "table"))
    )
    # Extract data from the current page
    for tr in table.find_elements(By.TAG_NAME, "tr")[1:]:  # Skip header
        row = [td.text for td in tr.find_elements(By.TAG_NAME, "td")]
        if row:  # Only add non-empty rows
            all_data.append(row)
    # Attempt to navigate to the next page
    try:
        next_button = driver.find_element(By.CSS_SELECTOR, "a.next-page")
        # Check if the next button is disabled (last page)
        if "disabled" in next_button.get_attribute("class"):
            break  # Exit the loop on the last page
        next_button.click()  # Click to go to the next page
        time.sleep(2)  # Wait for the new page to load
    except NoSuchElementException:
        break  # Exit if the next button is not found (no more pages)
# Create the final DataFrame with all collected data (reuses headers from the previous example)
df = pd.DataFrame(all_data, columns=headers)
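Some paginated sites serve the last page again when you click "next" past the end, which silently duplicates rows. One way to guard against that, sketched below, is to hash each row and skip any row you've already collected; the column-joining scheme is just an illustration.
# Sketch: drop rows whose content has already been scraped (guards against repeated pages)
import hashlib

seen_hashes = set()
unique_rows = []
for row in all_data:
    row_hash = hashlib.md5("|".join(row).encode("utf-8")).hexdigest()
    if row_hash not in seen_hashes:
        seen_hashes.add(row_hash)
        unique_rows.append(row)

df = pd.DataFrame(unique_rows, columns=headers)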
Best Practices and Tips
1. Error Handling
Always implement proper error handling to make your scraper more robust:
# Import libraries for timing and error handling
import time
from requests.exceptions import RequestException
def scrape_table_with_retry(url, max_retries=3):
    """Scrape a table with retry logic and exponential backoff"""
    for attempt in range(max_retries):
        try:
            # Make the HTTP request with a timeout
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # Raise an exception for bad status codes
            # Parse the HTML content
            soup = BeautifulSoup(response.content, 'html.parser')
            table = soup.find('table')
            # Validate that a table was found
            if not table:
                raise ValueError("No table found on the page")
            # Process the table and return the data (extract_table_data is your own parsing helper)
            return extract_table_data(table)
        except RequestException as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                # Wait before retrying (exponential backoff: 1s, 2s, 4s)
                time.sleep(2 ** attempt)
            else:
                # Re-raise the exception if all retries failed
                raise
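Calling the helper is then a one-liner. Here, extract_table_data stands in for whatever parsing function you already use, for example the header/row loop from the BeautifulSoup section.
# Example usage (assumes extract_table_data returns the parsed rows)
data = scrape_table_with_retry("https://example.com/table-page")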
2. Respect Robots.txt
Always check and respect the website's robots.txt file:
# Import libraries for parsing robots.txt files and URLs
from urllib.robotparser import RobotFileParser
from urllib.parse import urlparse

def can_fetch(url, user_agent="*"):
    """Check whether a URL may be scraped according to the site's robots.txt"""
    # Build the robots.txt URL from the scheme and domain, not the full page URL
    parsed = urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    # Create a robot file parser instance and point it at robots.txt
    rp = RobotFileParser()
    rp.set_url(robots_url)
    # Read and parse the robots.txt file
    rp.read()
    # Check whether the given user agent may fetch the URL
    return rp.can_fetch(user_agent, url)
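A quick guard before each request might look like this (the URL is the same placeholder used throughout):
# Only scrape the page if robots.txt allows it
url = "https://example.com/table-page"
if can_fetch(url):
    response = requests.get(url)
else:
    print(f"robots.txt disallows scraping {url}")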
3. Add Headers
Use appropriate headers to avoid being blocked:
# Define HTTP headers to mimic a real browser request
headers = {
    # Identify as a real browser to avoid blocking
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    # Specify accepted content types
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # Set preferred language
    'Accept-Language': 'en-US,en;q=0.5',
    # Specify accepted encoding methods
    'Accept-Encoding': 'gzip, deflate',
    # Keep the connection alive for better performance
    'Connection': 'keep-alive',
}
# Make request with the custom headers
response = requests.get(url, headers=headers)
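If you're making many requests, a requests.Session lets you set the headers once and reuses the underlying connection:
# Reuse one session for all requests: headers are sent automatically and connections are pooled
session = requests.Session()
session.headers.update(headers)
response = session.get(url)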
4. Rate Limiting
Implement rate limiting to avoid overwhelming the server:
# Import libraries for timing and efficient queue operations
import time
from collections import deque
class RateLimiter:
    """Rate limiter to control request frequency"""

    def __init__(self, calls, period):
        self.calls = calls  # Maximum number of calls allowed
        self.period = period  # Time period in seconds
        self.timestamps = deque()  # Store timestamps of recent calls

    def wait_if_needed(self):
        """Wait if necessary to respect rate limits"""
        now = time.time()
        # Remove timestamps older than the current period
        while self.timestamps and self.timestamps[0] < now - self.period:
            self.timestamps.popleft()
        # If we've hit the rate limit, wait until we can make another call
        if len(self.timestamps) >= self.calls:
            sleep_time = self.period - (now - self.timestamps[0])
            if sleep_time > 0:
                time.sleep(sleep_time)  # Wait before proceeding
        # Record the timestamp of this call
        self.timestamps.append(time.time())
# Create a rate limiter: 10 requests per minute
rate_limiter = RateLimiter(10, 60)
# Use the rate limiter when making multiple requests
for url in urls:
    rate_limiter.wait_if_needed()  # Ensure we don't exceed rate limits
    response = requests.get(url)
Common Issues and Solutions
1. Nested Tables
Handle tables within tables carefully:
# Find the main table by its CSS class
main_table = soup.find('table', {'class': 'main-table'})
# Get only direct child rows (exclude rows from nested tables)
rows = main_table.find_all('tr', recursive=False)
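One caveat: recursive=False only looks at direct children of the element you call it on, so it can return nothing when the rows sit inside a <tbody>. A more robust sketch is to keep only the rows whose nearest ancestor table is the outer table itself:
# Sketch: keep only rows that belong directly to the outer table, regardless of <tbody> wrappers
outer_rows = [
    tr for tr in main_table.find_all('tr')
    if tr.find_parent('table') is main_table
]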
2. Colspan and Rowspan
Handle cells that span multiple columns or rows:
def parse_table_with_spans(table):
    """Parse a table that contains cells with colspan and rowspan attributes"""
    # Get all table rows
    rows = table.find_all('tr')
    # Calculate the maximum number of columns needed
    max_cols = max(
        sum(int(td.get('colspan', 1)) for td in row.find_all(['td', 'th']))
        for row in rows
    )
    # Create a 2D grid to represent the table structure
    grid = [[None for _ in range(max_cols)] for _ in range(len(rows))]
    # Process each row and cell
    for row_idx, row in enumerate(rows):
        col_idx = 0
        for cell in row.find_all(['td', 'th']):
            # Skip grid positions already occupied by spanning cells
            while col_idx < max_cols and grid[row_idx][col_idx] is not None:
                col_idx += 1
            # Get colspan and rowspan values (default to 1)
            colspan = int(cell.get('colspan', 1))
            rowspan = int(cell.get('rowspan', 1))
            # Fill the grid for all positions covered by this spanning cell
            for r in range(rowspan):
                for c in range(colspan):
                    if row_idx + r < len(rows) and col_idx + c < max_cols:
                        grid[row_idx + r][col_idx + c] = cell.text.strip()
            # Move to the next column position
            col_idx += colspan
    return grid  # Return the completed grid structure
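The resulting grid can then be turned into a DataFrame, treating the first grid row as the header. That single-header-row assumption holds for most simple tables but may need adjusting for multi-level headers.
# Example usage: first grid row as headers, remaining rows as data (assumes a single header row)
grid = parse_table_with_spans(table)
df = pd.DataFrame(grid[1:], columns=grid[0])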
3. JavaScript-Rendered Content
For content that only appears after the initial page load, wait for it explicitly:
# Wait up to 20 seconds for specific content to appear
wait = WebDriverWait(driver, 20)
wait.until(
    # Wait for a table with a data attribute indicating it has finished loading
    EC.presence_of_element_located((By.XPATH, "//table[@data-loaded='true']"))
)
# Alternative: wait until the document reports that it has finished loading
wait.until(lambda d: d.execute_script("return document.readyState") == "complete")
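When there is no handy data-loaded attribute, another option is to wait until the table actually contains data rows. The CSS selector below is an assumption you'd adapt to your page.
# Sketch: wait until at least one data row is present (adapt the selector to your table)
wait.until(
    lambda d: len(d.find_elements(By.CSS_SELECTOR, "table.data-table tbody tr")) > 0
)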
Conclusion
Web scraping tables in Python can be approached in multiple ways, each with its own advantages:
- BeautifulSoup: Best for static HTML tables with complex parsing requirements
- pandas.read_html(): Quickest solution for simple, well-formatted tables
- requests-html: Good balance between simplicity and JavaScript support
- Selenium: Essential for complex dynamic content and user interaction
Choose the method that best fits your specific use case, and always remember to scrape responsibly by respecting robots.txt, implementing rate limiting, and following the website's terms of service.
| Method | Speed | JavaScript Support | Complexity | Best For |
|---|---|---|---|---|
| BeautifulSoup | Fast | No | Low | Static HTML tables |
| pandas.read_html() | Very Fast | No | Very Low | Quick extraction |
| requests-html | Medium | Yes | Low | Simple JS rendering |
| Selenium | Slow | Full | High | Complex interactions |