How to Web Scrape a Table in Python

Introduction

Web scraping tables is one of the most common tasks in data extraction. Whether you're collecting financial data, sports statistics, or product information, tables are everywhere on the web. In this comprehensive guide, we'll explore multiple methods to scrape HTML tables using Python.

We'll cover everything from basic static tables to complex dynamic tables loaded via JavaScript, ensuring you have the tools to handle any table scraping scenario.
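To set a baseline before we bring in a browser, here is the simplest possible case: a static table that already sits in the page's HTML can usually be read with requests and pandas alone. This is a minimal sketch; the URL is a placeholder you would swap for a real page containing a <table>.

Basic static table example
# Read a static HTML table with requests + pandas (no browser needed)
import requests
import pandas as pd
from io import StringIO

url = "https://example.com/table-page"  # placeholder URL
html = requests.get(url, timeout=30).text

# pandas parses every <table> on the page into a list of DataFrames
tables = pd.read_html(StringIO(html))
print(f"Found {len(tables)} table(s)")
print(tables[0].head())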

Prerequisites

Before we begin, make sure you have Python installed on your system. We'll be using several libraries throughout this tutorial:

Install via pip
pip install beautifulsoup4 requests pandas lxml selenium requests-html playwright rich

For Playwright specifically, you'll also need to install the browser binaries:

Install Playwright browsers
playwright install
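If you want to confirm the installation worked before diving in, a quick smoke test (a minimal sketch, assuming headless Chromium and example.com as a stand-in URL) looks like this:

Playwright smoke test
# Launch headless Chromium, load a page, and print its title
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto("https://example.com")
    print(page.title())  # prints the page title, e.g. "Example Domain"
    browser.close()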

TL;DR - Quick Start

Need to scrape a table fast? Here's the quickest route: a Playwright-based command-line scraper we put together for you:

Quick table scraping template
#!/usr/bin/env python3
"""
Table Scraper CLI - Python Edition
A command-line version that mimics the Chrome extension behavior
"""

# Import necessary libraries for web scraping and data manipulation
from playwright.sync_api import sync_playwright, Page, Locator  # Playwright for browser automation
import pandas as pd  # For data manipulation and creating DataFrames
from typing import List, Dict, Optional, Tuple, Any  # Type hints for better code documentation
import time  # For adding delays between operations
import sys  # System-specific parameters and functions
import os  # Operating system interface
import argparse  # For parsing command-line arguments
from datetime import datetime  # For timestamp generation
from rich.console import Console  # Rich library for beautiful terminal output
from rich.table import Table as RichTable  # For displaying tables in terminal
from rich.panel import Panel  # For creating styled panels in terminal
from rich.prompt import Prompt, Confirm  # For interactive user prompts
from rich.progress import Progress, SpinnerColumn, TextColumn  # For progress indicators
import json  # For JSON data handling
import hashlib  # For generating hashes to detect duplicate content
import re  # Regular expressions for pattern matching
from pathlib import Path  # Object-oriented filesystem paths
from io import StringIO  # String buffer for in-memory file operations
from urllib.parse import urlparse  # For parsing URLs to extract domain

# Initialize Rich console for styled terminal output
console = Console()

# Define path for storing domain-specific configurations (mimics Chrome extension's localStorage)
# This file stores saved selectors and settings for each domain
DOMAIN_CONFIG_FILE = Path.home() / ".table_scraper" / "domain_configs.json"

class TableScraperCLI:
    """Main class for the Table Scraper CLI application"""

    def __init__(self):
        """Initialize the scraper with default settings"""
        self.browser = None  # Playwright browser instance
        self.page = None  # Current page being scraped
        self.tables_data = []  # List of found tables with their data and metadata
        self.current_table_index = 0  # Index of currently selected table
        self.scraped_rows = []  # Accumulated rows from multi-page scraping
        self.next_button_selector = None  # CSS selector for pagination "next" button
        self.scraped_hashes = set()  # Set of content hashes for duplicate detection
        self.enable_pattern_detection = True  # Whether to detect div/list-based tables
        self.extract_nested_data = False  # Whether to extract links/images from table cells
        self.current_domain = None  # Domain of the current URL being scraped
        self.domain_configs = self.load_domain_configs()  # Load saved configurations for domains
        
    def launch_browser(self, url: str, headless: bool = False):
        """Launch browser and navigate to URL

        Args:
            url: The URL to navigate to
            headless: Whether to run browser without GUI (True = no visible browser)
        """
        # Start Playwright and launch Chromium browser
        playwright = sync_playwright().start()
        self.browser = playwright.chromium.launch(headless=headless)
        self.page = self.browser.new_page()

        # Load any saved configuration for this domain (like saved selectors)
        domain_config = self.get_domain_config(url)
        
        # Navigate to the URL and wait for page to fully load
        with console.status("[bold green]Loading page...") as status:
            self.page.goto(url)
            self.page.wait_for_load_state("networkidle")  # Wait until network is idle

        # Check if we have a previously saved "next" button selector for this domain
        if domain_config.get('nextSelector'):
            console.print(f"[dim]Found saved config for {self.current_domain}[/dim]")
            self.next_button_selector = domain_config['nextSelector']
            
        # Inject CSS styles for visual highlighting of tables and buttons
        # These styles help users see which table is selected and which button is the "next" button
        self.page.add_style_tag(content="""
            .tablescraper-selected-table {
                border: 3px solid red !important;  /* Red border for selected table */
            }

            .tablescraper-selected-row {
                background-color: rgba(225,227,107,.54) !important;  /* Yellow highlight for selected rows */
            }

            .tablescraper-hover {
                background-color: rgba(159,238,155,.44) !important;  /* Green highlight on hover */
            }

            .tablescraper-next-button {
                background-color: green !important;  /* Green background for next button */
            }

            /* Backwards compatibility with existing code */
            .tds-highlight {
                border: 3px solid red !important;
            }
            .tds-next-button {
                background-color: green !important;
            }
        """)
        
    def find_tables(self) -> List[Dict]:
        """Find all tables on the current page with intelligent scoring

        Returns:
            List of dictionaries containing table data and metadata
        """
        # Calculate the total area of the page body to filter out tiny tables
        body_area = self.page.evaluate("""
            () => {
                const body = document.body;
                const rect = body.getBoundingClientRect();
                return rect.width * rect.height;
            }
        """)
        min_area = body_area * 0.02  # Tables must be at least 2% of page area
        
        # Find all HTML table elements on the page
        tables = self.page.locator('table').all()
        self.tables_data = []  # Reset table data list

        # Process each table found on the page
        for i, table in enumerate(tables):
            try:
                # Skip hidden tables (display:none, visibility:hidden, etc.)
                if not table.is_visible():
                    continue
                    
                # Extract table dimensions and structure information
                # This helps us score tables to find the most relevant one
                table_info = table.evaluate("""
                    element => {
                        const rect = element.getBoundingClientRect();  // Get table dimensions
                        const rows = element.querySelectorAll('tr').length;  // Count table rows
                        const cols = element.querySelector('tr') ?  // Count columns in first row
                            element.querySelector('tr').querySelectorAll('td, th').length : 0;
                        return {
                            width: rect.width,
                            height: rect.height,
                            rows: rows,
                            cols: cols,
                            area: rect.width * rect.height  // Calculate total area
                        };
                    }
                """)
                
                # Skip tables that are too small (likely navigation or layout tables)
                if table_info['area'] < min_area:
                    continue

                # Calculate relevance score: larger tables with more rows score higher
                # Formula: area * rows² gives preference to data-rich tables
                score = table_info['area'] * (table_info['rows'] ** 2)
                
                # Extract table data using appropriate method
                if self.extract_nested_data:
                    # Enhanced extraction that captures links and images within cells
                    df = self.extract_table_with_nested_data(table)
                    df_valid = not df.empty
                else:
                    # Standard extraction using pandas' built-in HTML parser
                    table_html = table.evaluate('element => element.outerHTML')
                    df_list = pd.read_html(StringIO(table_html))  # Parse HTML table
                    df = df_list[0] if df_list else pd.DataFrame()
                    df_valid = df_list and not df_list[0].empty
                
                if df_valid:
                    # Clean up duplicate or sparse columns for better data quality
                    df = self.smart_column_deduplication(df)

                    # Store table data and metadata for later use
                    self.tables_data.append({
                        'index': i,  # Original index on page
                        'rows': table_info['rows'],  # Number of rows
                        'cols': len(df.columns),  # Number of columns after cleaning
                        'score': score,  # Relevance score
                        'area': table_info['area'],  # Visual area on page
                        'df': df,  # Pandas DataFrame with actual data
                        'element': table  # Reference to DOM element
                    })
            except Exception as e:
                console.print(f"[yellow]Warning: Could not parse table {i}: {e}[/yellow]")
        
        # Sort tables by relevance score (highest first) and keep only top 5
        # This helps users focus on the most likely data tables
        self.tables_data = sorted(self.tables_data, key=lambda x: x['score'], reverse=True)[:5]
        
        return self.tables_data
    
    def find_pattern_tables(self) -> List[Dict]:
        """Find div/list-based repeating patterns that look like tables

        Many modern websites use divs with CSS instead of HTML tables.
        This method detects these patterns and treats them as tables.

        Returns:
            List of dictionaries containing pattern-based table data
        """
        pattern_tables = []

        # Common CSS selectors that often contain tabular data
        # These patterns are commonly used in modern web design
        selectors = [
            'div[class*="table"]',  # Divs with "table" in class name
            'div[class*="grid"]',  # Grid layouts
            'div[class*="list"]',  # List containers
            'div[class*="row"]:has(> div)',  # Row-based layouts
            'ul[class*="list"]',  # Unordered lists with data
            '.results > *',  # Search result containers
            '.items > *',  # Item containers
            '.products > *',  # E-commerce product lists
            '[role="table"]',  # ARIA role for accessibility
            '[role="grid"]'  # ARIA grid role
        ]
        
        # Try each selector to find pattern-based tables
        for selector in selectors:
            try:
                # Get up to 5 containers matching each selector
                containers = self.page.locator(selector).all()[:5]

                for container in containers:
                    # Skip hidden containers
                    if not container.is_visible():
                        continue

                    # Analyze container for repeating structural patterns
                    # This JavaScript code runs in the browser to detect patterns
                    pattern_info = container.evaluate("""
                        element => {
                            const children = Array.from(element.children).filter(child => 
                                !['SCRIPT', 'STYLE', 'META'].includes(child.tagName)
                            );
                            
                            if (children.length < 3) return null;
                            
                            // Count how often each CSS class appears across children
                            // This helps identify repeating patterns
                            const classCount = {};
                            children.forEach(child => {
                                Array.from(child.classList).forEach(cls => {
                                    classCount[cls] = (classCount[cls] || 0) + 1;
                                });
                            });

                            // Find classes that appear in at least 70% of children
                            // These are likely the repeating pattern classes
                            const commonClasses = Object.entries(classCount)
                                .filter(([cls, count]) => count >= children.length * 0.7)
                                .map(([cls]) => cls);
                            
                            if (commonClasses.length === 0) return null;
                            
                            const rect = element.getBoundingClientRect();
                            
                            // Estimate the number of "columns" by counting text nodes
                            // in the first child element
                            const firstChild = children[0];
                            const textNodes = [];
                            // Create a tree walker to find all text nodes
                            const walker = document.createTreeWalker(
                                firstChild,
                                NodeFilter.SHOW_TEXT,  // Only look for text nodes
                                null,
                                false
                            );
                            
                            let node;
                            while (node = walker.nextNode()) {
                                if (node.textContent.trim()) {
                                    textNodes.push(node.textContent.trim());
                                }
                            }
                            
                            return {
                                childCount: children.length,
                                commonClass: commonClasses[0],
                                area: rect.width * rect.height,
                                width: rect.width,
                                height: rect.height,
                                estimatedCols: textNodes.length
                            };
                        }
                    """)
                    
                    if pattern_info and pattern_info['childCount'] >= 3:
                        # Extract data from pattern
                        df = self.extract_pattern_data(container, pattern_info)
                        
                        if df is not None and not df.empty:
                            score = pattern_info['area'] * (pattern_info['childCount'] ** 2)
                            
                            pattern_tables.append({
                                'rows': pattern_info['childCount'],
                                'cols': pattern_info['estimatedCols'],
                                'score': score,
                                'area': pattern_info['area'],
                                'df': df,
                                'element': container,
                                'type': 'pattern',
                                'pattern_class': pattern_info['commonClass']
                            })
            except Exception:
                continue  # Skip selectors that fail or match nothing on this page
        
        return pattern_tables
    
    def extract_pattern_data(self, container: Locator, pattern_info: Dict) -> Optional[pd.DataFrame]:
        """Extract data from pattern-based structure (div/list layouts)

        Args:
            container: The container element with repeating patterns
            pattern_info: Information about the detected pattern

        Returns:
            DataFrame with extracted data or None if extraction fails
        """
        try:
            # Execute JavaScript to extract data from the pattern structure
            data = container.evaluate("""
                (element, patternClass) => {
                    const rows = [];
                    const children = Array.from(element.children).filter(child => 
                        child.classList.contains(patternClass) || element.children.length < 10
                    );
                    
                    children.forEach(child => {
                        const row = {};
                        
                        // Extract all text nodes
                        const texts = [];
                        const walker = document.createTreeWalker(
                            child,
                            NodeFilter.SHOW_TEXT,
                            null,
                            false
                        );
                        
                        let node;
                        while (node = walker.nextNode()) {
                            const text = node.textContent.trim();
                            if (text && text.length > 0) {
                                texts.push(text);
                            }
                        }
                        
                        // Extract links
                        const links = Array.from(child.querySelectorAll('a')).map(a => ({
                            text: a.textContent.trim(),
                            href: a.href
                        }));
                        
                        // Build row data
                        texts.forEach((text, i) => {
                            row[`col_${i}`] = text;
                        });
                        
                        // Add link data
                        links.forEach((link, i) => {
                            if (link.text && !texts.includes(link.text)) {
                                row[`link_${i}`] = link.text;
                                row[`link_${i}_href`] = link.href;
                            }
                        });
                        
                        if (Object.keys(row).length > 0) {
                            rows.push(row);
                        }
                    });
                    
                    return rows;
                }
            """, pattern_info.get('commonClass', ''))
            
            if data and len(data) > 0:
                return pd.DataFrame(data)
        except Exception as e:
            console.print(f"[yellow]Error extracting pattern data: {e}[/yellow]")
        
        return None
    
    def extract_table_with_nested_data(self, table_element: Locator) -> pd.DataFrame:
        """Extract table data including nested elements like links and images

        This enhanced extraction captures not just text but also:
        - Links (href and text)
        - Images (src and alt text)
        - Other nested elements

        Args:
            table_element: The table DOM element to extract from

        Returns:
            DataFrame with extracted data including nested elements
        """
        # Execute JavaScript to extract comprehensive table data
        table_data = table_element.evaluate("""
            element => {
                const rows = [];
                const trs = element.querySelectorAll('tr');
                
                trs.forEach(tr => {
                    const row = {};
                    const cells = tr.querySelectorAll('td, th');
                    
                    cells.forEach((cell, index) => {
                        // Extract main text content from the cell
                        row[`col_${index}`] = cell.textContent.trim();

                        // Check for and extract links within the cell
                        const link = cell.querySelector('a');
                        if (link) {
                            row[`col_${index}_link`] = link.href;  // Link URL
                            row[`col_${index}_link_text`] = link.textContent.trim();  // Link text
                        }

                        // Check for and extract images within the cell
                        const img = cell.querySelector('img');
                        if (img) {
                            row[`col_${index}_img`] = img.src;  // Image URL
                            row[`col_${index}_img_alt`] = img.alt;  // Alt text
                        }
                    });
                    
                    if (Object.keys(row).length > 0) {
                        rows.push(row);
                    }
                });
                
                return rows;
            }
        """)
        
        if table_data:
            df = pd.DataFrame(table_data)
            # Clean up column names by removing empty link/img columns
            cols_to_keep = []
            for col in df.columns:
                if not (col.endswith('_link') or col.endswith('_img') or col.endswith('_link_text') or col.endswith('_img_alt')):
                    cols_to_keep.append(col)
                elif df[col].notna().any():  # Keep only if has data
                    cols_to_keep.append(col)
            return df[cols_to_keep]
        
        return pd.DataFrame()
    
    def clear_all_highlights(self):
        """Clear all visual highlighting from the page

        Removes all CSS classes used for highlighting tables and rows.
        This is called before applying new highlights.
        """
        self.page.evaluate("""
            () => {
                // Remove all highlighting classes from all elements
                document.querySelectorAll('*').forEach(el => {
                    el.classList.remove('tablescraper-selected-table');  // Remove table highlight
                    el.classList.remove('tablescraper-selected-row');  // Remove row highlight
                    el.classList.remove('tablescraper-hover');  // Remove hover effect
                });
            }
        """)
    
    def highlight_table(self, table_index: int):
        """Highlight a specific table on the page for visual identification

        Args:
            table_index: Index of the table in self.tables_data to highlight
        """
        # First clear any existing highlights
        self.clear_all_highlights()

        # Apply highlighting to the selected table
        if table_index < len(self.tables_data):
            table_data = self.tables_data[table_index]
            
            # Apply visual highlighting to the table element
            table_data['element'].evaluate("""
                element => {
                    // Add red border to table
                    element.classList.add('tablescraper-selected-table');
                    // Scroll table into view smoothly
                    element.scrollIntoView({behavior: 'smooth', block: 'center'});

                    // Highlight all rows within the table
                    const rows = element.querySelectorAll('tr');
                    rows.forEach(row => {
                        row.classList.add('tablescraper-selected-row');  // Yellow background
                    });
                }
            """)
            
            # Enable hover effects on table rows
            self.enable_hover_effects(table_data['element'])
    
    def enable_hover_effects(self, table_element):
        """Enable row hover effects (port of extension's d function)"""
        table_element.evaluate("""
            element => {
                // Remove any existing hover listeners
                if (window.tableHoverListeners) {
                    window.tableHoverListeners.forEach(({el, fn}) => {
                        el.removeEventListener('mouseover', fn);
                    });
                    window.tableHoverListeners = [];
                }
                
                window.tableHoverListeners = [];
                
                // Add hover effects to rows
                const rows = element.querySelectorAll('tr');
                rows.forEach(row => {
                    const hoverFn = function(e) {
                        // Remove hover from all elements
                        document.querySelectorAll('.tablescraper-hover').forEach(el => {
                            el.classList.remove('tablescraper-hover');
                        });
                        
                        // Add hover to current row
                        this.classList.add('tablescraper-hover');
                    };
                    
                    const leaveFn = function(e) {
                        this.classList.remove('tablescraper-hover');
                    };
                    
                    row.addEventListener('mouseover', hoverFn);
                    row.addEventListener('mouseleave', leaveFn);
                    
                    window.tableHoverListeners.push({el: row, fn: hoverFn});
                    window.tableHoverListeners.push({el: row, fn: leaveFn});
                });
            }
        """)
    
    def calculate_content_hash(self, df: pd.DataFrame) -> str:
        """Calculate SHA256 hash of dataframe content for duplicate detection

        Creates a unique fingerprint of the data to detect when we've
        scraped the same content (useful for pagination).

        Args:
            df: DataFrame to hash

        Returns:
            SHA256 hash string of the data content
        """
        # Convert DataFrame to JSON format for hashing
        content = df.to_json(orient='records')
        # Sort the keys so hashing is consistent regardless of column order (json is imported at module level)
        data = json.loads(content)
        sorted_content = json.dumps(data, sort_keys=True)
        # Generate SHA256 hash of the sorted content
        return hashlib.sha256(sorted_content.encode()).hexdigest()
    
    def check_duplicate(self, df: pd.DataFrame) -> bool:
        """Check if this data has already been scraped

        Used during pagination to detect when we've reached the end
        or are seeing repeated content.

        Args:
            df: DataFrame to check

        Returns:
            True if this exact data has been seen before, False otherwise
        """
        content_hash = self.calculate_content_hash(df)
        # Check if we've seen this exact content before
        if content_hash in self.scraped_hashes:
            return True  # Duplicate found
        # Add to set of seen content
        self.scraped_hashes.add(content_hash)
        return False  # New content
    
    def smart_column_deduplication(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply smart column deduplication to clean up messy tables

        This method:
        1. Removes columns where all values are identical (no information)
        2. Merges duplicate columns that don't overlap
        3. Removes sparse columns with too little data

        Args:
            df: DataFrame to clean

        Returns:
            Cleaned DataFrame with deduplicated columns
        """
        if df.empty:
            return df

        # Step 1: Remove columns where all values are identical (no useful information)
        cols_to_keep = []
        for col in df.columns:
            unique_values = df[col].dropna().unique()
            # Keep column if it has varying values or is completely empty
            if len(unique_values) > 1 or len(unique_values) == 0:
                cols_to_keep.append(col)
        
        df = df[cols_to_keep]
        
        # Step 2: Handle duplicate column names (often from poorly formatted HTML)
        column_counts = {}  # Track column name occurrences
        new_columns = []  # Build new column names

        for col in df.columns:
            if col in column_counts:
                # Check if duplicate columns can be merged (no overlapping data)
                can_merge = True
                for idx in df.index:
                    # If both columns have data at same row, can't merge
                    if pd.notna(df.loc[idx, col]) and pd.notna(df.iloc[idx, column_counts[col]]):
                        can_merge = False
                        break
                
                if can_merge:
                    # Merge columns - keep existing column index
                    for idx in df.index:
                        if pd.notna(df.loc[idx, col]):
                            df.iloc[idx, column_counts[col]] = df.loc[idx, col]
                    # Skip this duplicate column
                    continue
                else:
                    # Add numeric suffix
                    suffix_num = 2
                    new_col_name = f"{col} {suffix_num}"
                    while new_col_name in new_columns:
                        suffix_num += 1
                        new_col_name = f"{col} {suffix_num}"
                    new_columns.append(new_col_name)
            else:
                column_counts[col] = len(new_columns)
                new_columns.append(col)
        
        # Apply new column names
        df = df.iloc[:, [i for i, col in enumerate(df.columns) if i < len(new_columns)]]
        df.columns = new_columns[:len(df.columns)]
        
        # Step 3: Remove sparse columns (less than 20% data)
        # These are often formatting artifacts or empty columns
        min_data_ratio = 0.2  # Require at least 20% non-null values
        cols_to_keep = []
        for col in df.columns:
            # Calculate ratio of non-null values
            non_null_ratio = df[col].notna().sum() / len(df)
            if non_null_ratio >= min_data_ratio:
                cols_to_keep.append(col)
        
        return df[cols_to_keep] if cols_to_keep else df
    
    def infer_data_types(self, df: pd.DataFrame) -> pd.DataFrame:
        """Infer and convert data types for better Excel/CSV export

        Automatically detects:
        - Numbers (including currency and percentages)
        - Dates and timestamps
        - Boolean values

        Args:
            df: DataFrame with string data

        Returns:
            DataFrame with properly typed columns
        """
        df_typed = df.copy()  # Work on a copy to avoid modifying original
        
        for col in df_typed.columns:
            # Skip columns that are already non-string
            if df_typed[col].dtype != 'object':
                continue
                
            # Try to convert to numeric
            try:
                # Remove common number formatting characters (commas, dollar signs)
                cleaned = df_typed[col].astype(str).str.replace(',', '', regex=False).str.replace('$', '', regex=False).str.strip()
                
                # Check if column contains percentage values
                if cleaned.str.endswith('%').any():
                    cleaned = cleaned.str.rstrip('%')  # Remove % sign
                    numeric_vals = pd.to_numeric(cleaned, errors='coerce')
                    # Convert if at least 50% of values are valid percentages
                    if numeric_vals.notna().sum() > len(cleaned) * 0.5:
                        df_typed[col] = numeric_vals / 100  # Convert to decimal (50% -> 0.5)
                        continue
                
                # Try regular numeric conversion
                numeric_vals = pd.to_numeric(cleaned, errors='coerce')
                if numeric_vals.notna().sum() > len(cleaned) * 0.8:  # At least 80% valid
                    df_typed[col] = numeric_vals
                    continue
            except Exception:
                pass
            
            # Try to convert to datetime
            try:
                # Only attempt if values look like dates (check first 10 values)
                sample = df_typed[col].dropna().astype(str).head(10)
                # Common date patterns to check for
                date_patterns = [r'\d{4}-\d{2}-\d{2}',  # YYYY-MM-DD
                               r'\d{1,2}/\d{1,2}/\d{2,4}',  # MM/DD/YYYY or M/D/YY
                               r'\d{1,2}-\d{1,2}-\d{2,4}',  # MM-DD-YYYY
                               r'\w+ \d{1,2}, \d{4}']  # Month DD, YYYY
                
                if any(sample.str.match(pattern).any() for pattern in date_patterns):
                    date_vals = pd.to_datetime(df_typed[col], errors='coerce')
                    if date_vals.notna().sum() > len(df_typed[col]) * 0.5:
                        df_typed[col] = date_vals
                        continue
            except Exception:
                pass
            
            # Check for boolean values
            try:
                lower_vals = df_typed[col].astype(str).str.lower().str.strip()
                # Check if all values are boolean-like
                if set(lower_vals.dropna().unique()) <= {'true', 'false', 'yes', 'no', '1', '0', ''}:
                    # Map string values to actual booleans
                    bool_map = {'true': True, 'false': False,
                               'yes': True, 'no': False,
                               '1': True, '0': False,
                               '': None}  # Empty string becomes None
                    df_typed[col] = lower_vals.map(bool_map)
                    continue
            except Exception:
                pass
        
        return df_typed
    
    def generate_smart_selector(self, element_js: str) -> str:
        """Generate robust CSS selector avoiding dynamic IDs

        Creates a CSS selector that will reliably find the element even
        if the page structure changes slightly. Avoids IDs with numbers
        which are often dynamically generated.

        Args:
            element_js: JavaScript expression to get the element

        Returns:
            CSS selector string for the element
        """
        selector = self.page.evaluate(f"""
            (function() {{
                const element = {element_js};  // Get the target element
                if (!element) return '';
                
                // Helper function to escape CSS special characters
                // This ensures selectors work even with special characters in classes/IDs
                function escapeCSS(str, prefix) {{
                    const escaped = (prefix || '.') +  // Use # for IDs, . for classes
                        str.replace(/[!"#$%&'()*+,.\\/:;<=>?@[\\\\\\]^`{{|}}~]/g, '\\\\$&').trim();
                    return escaped;
                }}
                
                // Build selector from element to root, creating a path
                const parts = [];
                let current = element;

                // Walk up the DOM tree from element to body
                while (current && current.tagName &&
                       current.tagName.toLowerCase() !== 'html' &&
                       current.tagName.toLowerCase() !== 'body') {{
                    
                    let selector = current.tagName.toLowerCase();
                    
                    // Use ID if it exists and doesn't contain numbers
                    // (numbers often indicate dynamic IDs that change)
                    if (current.id && current.id.trim() && !current.id.match(/\\d+/)) {{
                        selector += escapeCSS(current.id, '#');
                    }}
                    // Otherwise use CSS classes
                    else if (current.className && current.className.trim()) {{
                        // Handle className that might be an object (for SVG elements)
                        const classStr = typeof current.className === 'string' ?
                            current.className : current.className.baseVal || '';
                        if (classStr) {{
                            // Join multiple classes with dots
                            selector += escapeCSS(classStr).replace(/\\s+/g, '.');
                        }}
                    }}
                    
                    parts.unshift(selector);
                    current = current.parentElement;
                }}
                
                return parts.join(' > ');
            }})();
        """)
        
        return selector
    
    def load_domain_configs(self) -> Dict[str, Dict]:
        """Load domain-specific configurations from disk

        Loads saved configurations like next button selectors for each domain,
        mimicking the Chrome extension's localStorage.

        Returns:
            Dictionary of domain configurations
        """
        if DOMAIN_CONFIG_FILE.exists():
            try:
                with open(DOMAIN_CONFIG_FILE, 'r') as f:
                    return json.load(f)
            except (json.JSONDecodeError, OSError):
                # Return empty dict if the file is corrupted or unreadable
                return {}
        return {}  # No config file exists yet
    
    def save_domain_configs(self):
        """Save domain configurations to disk

        Persists configurations so they can be reused in future sessions.
        """
        # Create directory if it doesn't exist
        DOMAIN_CONFIG_FILE.parent.mkdir(parents=True, exist_ok=True)
        # Write configurations as formatted JSON
        with open(DOMAIN_CONFIG_FILE, 'w') as f:
            json.dump(self.domain_configs, f, indent=2)
    
    def get_domain_config(self, url: str) -> Dict[str, Any]:
        """Get configuration for a specific domain

        Args:
            url: URL to extract domain from

        Returns:
            Configuration dictionary for the domain
        """
        # Extract domain from URL
        parsed = urlparse(url)
        hostname = parsed.hostname or parsed.netloc
        self.current_domain = hostname

        # Create default config if domain is new
        if hostname not in self.domain_configs:
            self.domain_configs[hostname] = {
                'nextSelector': None,  # CSS selector for "next" button
                'tableSelector': None,  # CSS selector for table
                'crawlDelay': 1000,  # Delay between pages in milliseconds
                'maxWait': 20000,  # Maximum wait time for page load
                'deletedFields': [],  # Columns to exclude
                'customHeaders': {},  # Custom HTTP headers
                'lastUsed': datetime.now().isoformat()  # Last access time
            }
        
        return self.domain_configs[hostname]
    
    def save_next_selector(self, selector: str):
        """Save next button selector for current domain

        Saves the selector so it can be reused next time this domain is scraped.

        Args:
            selector: CSS selector for the "next" button
        """
        if self.current_domain:
            # Update configuration
            self.domain_configs[self.current_domain]['nextSelector'] = selector
            self.domain_configs[self.current_domain]['lastUsed'] = datetime.now().isoformat()
            # Persist to disk
            self.save_domain_configs()
            console.print(f"[green]βœ“ Saved selector for {self.current_domain}[/green]")
    
    def save_table_selector(self, selector: str):
        """Save table selector for current domain"""
        if self.current_domain:
            self.domain_configs[self.current_domain]['tableSelector'] = selector
            self.save_domain_configs()
    
    def save_deleted_fields(self, fields: List[str]):
        """Save deleted column names for current domain"""
        if self.current_domain:
            self.domain_configs[self.current_domain]['deletedFields'] = fields
            self.save_domain_configs()
    
    def find_scrollable_parent(self, element_selector: str) -> Optional[str]:
        """Find the scrollable parent element (port of extension's scroll detection)"""
        scrollable_parent = self.page.evaluate(f"""
            async () => {{
                // Helper to check if element can scroll
                const canScroll = async (element) => {{
                    const originalTop = element.scrollTop;
                    element.scrollTop = element.scrollTop + 50;
                    await new Promise(resolve => setTimeout(resolve, 10));
                    const scrolled = element.scrollTop > originalTop;
                    element.scrollTop = originalTop;
                    return scrolled;
                }};
                
                let element = document.querySelector('{element_selector}');
                
                // Walk up the DOM tree to find scrollable parent
                while (element && element !== document.body) {{
                    if (await canScroll(element)) {{
                        // Generate selector for this element
                        return element.id ? '#' + element.id : 
                               element.className ? '.' + element.className.split(' ')[0] : 
                               element.tagName.toLowerCase();
                    }}
                    element = element.parentElement;
                }}
                
                // Check body as last resort
                if (await canScroll(document.body)) {{
                    return 'body';
                }}
                
                return null;
            }}
        """)
        
        return scrollable_parent
    
    def lazy_load_scroll(self, container_selector: str, target_selector: str, max_time: int = 30):
        """Implement lazy loading by scrolling (port of extension's N function)"""
        console.print(f"[bold yellow]Performing lazy load scrolling...[/bold]")
        
        result = self.page.evaluate(f"""
            async () => {{
                const container = document.querySelector('{container_selector}');
                if (!container) return {{ error: 'Container not found' }};
                
                const targetElements = () => document.querySelectorAll('{target_selector}');
                const initialCount = targetElements().length;
                let previousCount = initialCount;
                let previousScrollTop = container.scrollTop;
                let noChangeCount = 0;
                const startTime = Date.now();
                const maxTime = {max_time} * 1000;
                
                console.log('Starting lazy load scroll. Initial elements:', initialCount);
                
                while (Date.now() - startTime < maxTime) {{
                    // Scroll down by 1000 pixels
                    container.scrollTop += 1000;
                    
                    // Wait for content to load
                    await new Promise(resolve => setTimeout(resolve, 1000));
                    
                    const currentCount = targetElements().length;
                    const currentScrollTop = container.scrollTop;
                    
                    // Check if new content loaded
                    if (currentCount > previousCount) {{
                        console.log('New elements loaded:', currentCount - previousCount);
                        previousCount = currentCount;
                        noChangeCount = 0;
                    }} else if (currentScrollTop === previousScrollTop) {{
                        // We've reached the bottom
                        noChangeCount++;
                        if (noChangeCount >= 3) {{
                            console.log('Reached bottom of scrollable area');
                            break;
                        }}
                    }}
                    
                    previousScrollTop = currentScrollTop;
                }}
                
                const finalCount = targetElements().length;
                return {{
                    success: true,
                    initialCount: initialCount,
                    finalCount: finalCount,
                    newElements: finalCount - initialCount
                }};
            }}
        """)
        
        if result.get('error'):
            console.print(f"[red]Error: {result['error']}[/red]")
        else:
            console.print(f"[green]βœ“ Loaded {result['newElements']} new elements[/green]")
        
        return result
    
    def wait_for_network_idle(self, max_wait: int = 20000, crawl_delay: int = 1000) -> bool:
        """Wait for network requests to complete

        Ensures the page is fully loaded before scraping, especially important
        for pages that load data via AJAX.

        Args:
            max_wait: Maximum time to wait in milliseconds
            crawl_delay: Additional delay after network idle in milliseconds

        Returns:
            True if network became idle, False if timeout
        """
        console.print("[dim]Waiting for network activity to complete...[/dim]")

        try:
            # Wait for network to be idle (no requests for 500ms)
            self.page.wait_for_load_state("networkidle", timeout=max_wait)

            # Additional wait to ensure dynamic content is rendered
            time.sleep(crawl_delay / 1000)

            return True
        except Exception as e:
            console.print(f"[yellow]Network wait timeout: {e}[/yellow]")
            return False
    
    def smart_page_wait(self, action_fn=None, max_wait: int = 20000, crawl_delay: int = 1000):
        """Smart page waiting with network monitoring

        Intelligently waits for page to be ready after an action (like clicking next).
        Combines DOM readiness, network idle, and safety delays.

        Args:
            action_fn: Optional function to execute (e.g., click next button)
            max_wait: Maximum wait time in milliseconds
            crawl_delay: Additional safety delay in milliseconds

        Returns:
            True if page settled successfully
        """
        # Step 1: Execute action if provided (e.g., click next button)
        if action_fn:
            action_fn()

        # Step 2: Wait for DOM to be ready
        try:
            self.page.wait_for_load_state("domcontentloaded", timeout=max_wait // 4)
        except Exception:
            pass  # Continue even if the DOM-ready wait times out

        # Step 3: Wait for network requests to complete
        network_settled = self.wait_for_network_idle(max_wait, crawl_delay)

        # Step 4: Additional safety wait for JavaScript rendering
        time.sleep(crawl_delay / 1000)

        return network_settled
    
    def row_by_row_scroll(self, table_selector: str):
        """Scroll through table rows one by one (port of extension's E function)"""
        self.page.evaluate(f"""
            () => {{
                const table = document.querySelector('{table_selector}');
                if (!table) return;
                
                const rows = table.querySelectorAll('tr');
                let delay = 50; // Default 50ms between rows
                
                // Adjust delay based on row count (max 3 seconds total)
                if (rows.length * delay > 3000) {{
                    delay = Math.max(10, 3000 / rows.length);
                }}
                
                console.log('Scrolling through', rows.length, 'rows with', delay, 'ms delay');
                
                let index = 0;
                const scrollInterval = setInterval(() => {{
                    if (index >= rows.length || index * delay > 10000) {{
                        clearInterval(scrollInterval);
                        return;
                    }}
                    
                    rows[index].scrollIntoView({{ behavior: 'smooth', block: 'center' }});
                    index++;
                }}, delay);
            }}
        """)
    
    def display_table_preview(self, df: pd.DataFrame, max_rows: int = 10):
        """Display a preview of the dataframe in the terminal

        Args:
            df: DataFrame to display
            max_rows: Maximum number of rows to show
        """
        # Create a Rich table for beautiful terminal output
        rich_table = RichTable(title="Table Preview", show_lines=True)
        
        # Add column headers
        for col in df.columns:
            rich_table.add_column(str(col), style="cyan", no_wrap=False)

        # Add data rows (limited to max_rows)
        for _, row in df.head(max_rows).iterrows():
            # Convert all values to strings for display
            rich_table.add_row(*[str(val) for val in row.values])
        
        console.print(rich_table)
        
        if len(df) > max_rows:
            console.print(f"[dim]... and {len(df) - max_rows} more rows[/dim]")
    
    def select_table(self):
        """Let user select which table to scrape

        In interactive mode, allows cycling through found tables.
        In non-interactive mode, auto-selects the highest scoring table.

        Returns:
            Index of selected table or None if cancelled
        """
        if not self.tables_data:
            console.print("[red]No tables found on this page![/red]")
            return None

        # In non-interactive mode (piped input), auto-select the best table
        if not sys.stdin.isatty():
            current = self.tables_data[0]  # Highest scoring table
            # Show what type of table was found
            table_type = "πŸ“Š HTML Table" if current.get('type') != 'pattern' else "πŸ“‹ Pattern-based"
            console.print(f"\n[bold]Auto-selected: {table_type}[/bold]")
            console.print(f"Rows: {current['rows']}, Columns: {current['cols']}")
            if 'score' in current:
                console.print(f"Score: {current['score']:,.0f}")
            # Highlight the table on the page
            self.highlight_table(0)
            # Show preview of the data
            self.display_table_preview(current['df'])
            return 0
            
        # Interactive mode - let user cycle through tables
        while True:
            # Get current table being previewed
            current = self.tables_data[self.current_table_index]

            # Display table information
            table_type = "πŸ“Š HTML Table" if current.get('type') != 'pattern' else "πŸ“‹ Pattern-based"
            console.print(f"\n[bold]{table_type} {self.current_table_index + 1} of {len(self.tables_data)}[/bold]")
            console.print(f"Rows: {current['rows']}, Columns: {current['cols']}")
            if 'score' in current:
                console.print(f"Score: {current['score']:,.0f} (larger tables with more data score higher)")
            
            self.highlight_table(self.current_table_index)
            self.display_table_preview(current['df'])
            
            # Ask user what to do
            console.print("\n[bold]Options:[/bold]")
            console.print("1. Use this table")
            console.print("2. Try another table")
            console.print("3. Cancel")
            
            choice = Prompt.ask("Select", choices=["1", "2", "3"])
            
            if choice == "1":
                return self.current_table_index
            elif choice == "2":
                self.current_table_index = (self.current_table_index + 1) % len(self.tables_data)
            else:
                return None
    
    def locate_next_button(self):
        """Interactive next button selection

        Allows user to click on the "next" button in the browser,
        which will be captured and saved for pagination.

        Returns:
            True if button was successfully located, False otherwise
        """
        console.print("\n[bold yellow]Click on the 'Next' button or pagination link in the browser[/bold]")
        console.print("[dim]The button will be highlighted in green when clicked[/dim]")
        
        # Inject JavaScript to intercept clicks and identify the next button
        self.page.evaluate("""
            window.tdsNextButton = null;  // Store the clicked element
            window.tdsClickHandler = function(e) {
                e.preventDefault();  // Prevent default action
                e.stopPropagation();  // Stop event bubbling
                
                // Remove any previous highlight
                document.querySelectorAll('.tds-next-button').forEach(el => {
                    el.classList.remove('tds-next-button');
                });

                // Highlight the clicked element in green
                e.target.classList.add('tds-next-button');
                window.tdsNextButton = e.target;  // Store reference
                
                // Generate a basic selector for the clicked element
                let selector = '';
                if (e.target.id) {
                    selector = '#' + e.target.id;  // Use ID if available
                } else if (e.target.className) {
                    // Use class names (excluding our highlight class)
                    selector = '.' + e.target.className.split(' ').filter(c => c !== 'tds-next-button').join('.');
                } else {
                    // Fall back to tag name and text content
                    selector = e.target.tagName.toLowerCase();
                    if (e.target.textContent) {
                        selector += ':has-text("' + e.target.textContent.trim() + '")';
                    }
                }
                window.tdsNextSelector = selector;  // Store basic selector
                window.tdsNextElement = e.target;  // Store element reference
                
                return false;
            };
            document.addEventListener('click', window.tdsClickHandler, true);
        """)
        
        # Wait for user to click on the next button (30 second timeout)
        start_time = time.time()
        while time.time() - start_time < 30:
            # Check whether the user has clicked something (DOM nodes don't serialize back from evaluate, so test for presence)
            next_button = self.page.evaluate("window.tdsNextButton !== null")
            if next_button:
                # Generate a robust selector that will work across pages
                smart_selector = self.generate_smart_selector("window.tdsNextElement")
                if smart_selector:
                    self.next_button_selector = smart_selector
                else:
                    # Fall back to the basic selector if smart generation fails
                    self.next_button_selector = self.page.evaluate("window.tdsNextSelector")
                
                # Remove event listener
                self.page.evaluate("""
                    document.removeEventListener('click', window.tdsClickHandler, true);
                """)
                
                console.print(f"\n[green]βœ“ Next button located: {self.next_button_selector}[/green]")
                
                # Save selector for future use
                self.save_next_selector(self.next_button_selector)
                
                return True
                
            time.sleep(0.1)
        
        # Timeout
        self.page.evaluate("""
            document.removeEventListener('click', window.tdsClickHandler, true);
        """)
        console.print("[red]Timeout waiting for next button selection[/red]")
        return False
    
    def crawl_pages(self, max_pages: int = 10, delay: int = 1):
        """Crawl multiple pages using pagination

        Args:
            max_pages: Maximum number of pages to crawl
            delay: Delay between pages in seconds

        Returns:
            Tuple of (combined DataFrame, number of pages scraped)
        """
        pages_scraped = 1  # Start with current page

        # Get initial data from selected table
        df = self.tables_data[self.current_table_index]['df']
        all_data = df.to_dict('records')  # Convert to list of dicts
        
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            console=console
        ) as progress:
            task = progress.add_task("Crawling pages...", total=max_pages)
            
            while pages_scraped < max_pages:
                try:
                    # Get domain-specific timing configuration
                    domain_config = self.domain_configs.get(self.current_domain, {})
                    max_wait = domain_config.get('maxWait', 20000)  # Max wait time
                    crawl_delay = domain_config.get('crawlDelay', delay * 1000)  # Delay between pages

                    # Define function to click the next button
                    def click_next():
                        self.page.click(self.next_button_selector)

                    # Click next and wait intelligently for page to load
                    self.smart_page_wait(click_next, max_wait, crawl_delay)
                    
                    # Find tables on the newly loaded page
                    tables = self.page.locator('table').all()
                    if not tables:
                        console.print("[yellow]No tables found on new page[/yellow]")
                        break

                    # Reuse the same table position, clamping to the last available table
                    table_index = min(self.current_table_index, len(tables) - 1)
                    # Extract HTML of the target table
                    table_html = tables[table_index].evaluate('element => element.outerHTML')
                    
                    # Parse the table HTML into a DataFrame
                    df_list = pd.read_html(StringIO(table_html))
                    if df_list and not df_list[0].empty:
                        # Check if this is duplicate content (end of pagination)
                        if self.check_duplicate(df_list[0]):
                            console.print("[yellow]⚠️  Duplicate content detected, stopping crawl[/yellow]")
                            break

                        # Add new rows to our collection
                        new_rows = df_list[0].to_dict('records')
                        all_data.extend(new_rows)
                        pages_scraped += 1
                        
                        progress.update(task, advance=1, 
                                      description=f"Scraped {pages_scraped} pages, {len(all_data)} rows")
                    else:
                        console.print("[yellow]No data found on new page[/yellow]")
                        break
                        
                except Exception as e:
                    console.print(f"[red]Error: {e}[/red]")
                    break
        
        return pd.DataFrame(all_data), pages_scraped
    
    def save_data(self, df: pd.DataFrame):
        """Save scraped data with intelligent type preservation

        Offers multiple export formats and can automatically detect
        and preserve data types.

        Args:
            df: DataFrame to save
        """
        console.print("\n[bold]Export Options:[/bold]")
        console.print("1. CSV")  # Standard comma-separated values
        console.print("2. Excel (XLSX) - with data types preserved")  # Excel with formatting
        console.print("3. JSON")  # JavaScript Object Notation
        console.print("4. Copy to clipboard")  # Direct copy for pasting
        
        choice = Prompt.ask("Select format", choices=["1", "2", "3", "4"])
        
        # Ask if user wants automatic type detection
        preserve_types = Confirm.ask("Detect and preserve data types (numbers, dates, etc)?", default=True)
        if preserve_types:
            console.print("[dim]Detecting data types...[/dim]")
            # Apply intelligent type inference
            df_typed = self.infer_data_types(df)
            
            # Show user what data types were detected
            type_changes = []
            for col in df.columns:
                # Check if column type changed
                if df[col].dtype != df_typed[col].dtype:
                    type_changes.append(f"{col}: {df[col].dtype} β†’ {df_typed[col].dtype}")
            
            if type_changes:
                console.print("[green]Detected types:[/green]")
                for change in type_changes[:5]:  # Show first 5
                    console.print(f"  {change}")
                if len(type_changes) > 5:
                    console.print(f"  ... and {len(type_changes) - 5} more")
            
            df = df_typed
        
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        
        if choice == "1":
            # Export as CSV
            filename = f"scraped_data_{timestamp}.csv"
            df.to_csv(filename, index=False)  # Don't include row indices
            console.print(f"[green]βœ“ Saved to {filename}[/green]")
            
        elif choice == "2":
            # Export as Excel with formatting
            filename = f"scraped_data_{timestamp}.xlsx"

            # Use Excel writer with openpyxl engine for formatting support
            with pd.ExcelWriter(filename, engine='openpyxl') as writer:
                df.to_excel(writer, index=False, sheet_name='Data')

                # Auto-adjust column widths for better readability
                from openpyxl.utils import get_column_letter  # Handles columns beyond Z
                worksheet = writer.sheets['Data']
                for idx, col in enumerate(df.columns):
                    # Calculate optimal width based on content
                    max_length = max(
                        df[col].astype(str).map(len).max(),  # Max data length
                        len(str(col))  # Column header length
                    ) + 2  # Add padding
                    # Set width (capped at 50 to prevent overly wide columns)
                    worksheet.column_dimensions[get_column_letter(idx + 1)].width = min(max_length, 50)
            
            console.print(f"[green]βœ“ Saved to {filename} with formatting[/green]")
            
        elif choice == "3":
            filename = f"scraped_data_{timestamp}.json"
            # JSON with date handling
            df.to_json(filename, orient='records', indent=2, date_format='iso')
            console.print(f"[green]βœ“ Saved to {filename}[/green]")
            
        elif choice == "4":
            df.to_clipboard(index=False, sep='\t')
            console.print("[green]βœ“ Copied to clipboard[/green]")
    
    def run(self, url: str, headless_override: Optional[bool] = None):
        """Main execution flow for interactive scraping

        Args:
            url: URL to scrape
            headless_override: Force headless mode if specified
        """
        try:
            # Determine browser mode (headless or visible)
            if headless_override is not None:
                headless = headless_override
                console.print(f"[dim]Running in {'headless' if headless else 'headed'} mode (from command line)[/dim]")
            else:
                # Ask user preference
                headless = Confirm.ask("Run browser in headless mode?", default=True)
            # Launch browser and navigate to URL
            self.launch_browser(url, headless)
            
            # Ask about advanced features (only in interactive mode)
            if sys.stdin.isatty():
                self.extract_nested_data = Confirm.ask("Extract links and images from table cells?", default=False)
            else:
                # Use command line flags or defaults in non-interactive mode
                console.print(f"[dim]Using default settings (extract nested: {self.extract_nested_data})[/dim]")
            
            # Find tables
            console.print("\n[bold]Searching for tables...[/bold]")
            tables = self.find_tables()
            
            # Also find pattern-based tables if enabled
            if self.enable_pattern_detection:
                console.print("[dim]Also checking for div/list-based tables...[/dim]")
                pattern_tables = self.find_pattern_tables()
                if pattern_tables:
                    tables.extend(pattern_tables)
                    # Re-sort all by score
                    self.tables_data = sorted(tables, key=lambda x: x['score'], reverse=True)[:5]
            
            if not self.tables_data:
                console.print("[red]No tables found on this page![/red]")
                return
                
            console.print(f"[green]Found {len(self.tables_data)} table candidate(s)[/green]")
            
            # Show what types were found
            html_tables = sum(1 for t in self.tables_data if t.get('type') != 'pattern')
            pattern_tables = sum(1 for t in self.tables_data if t.get('type') == 'pattern')
            if pattern_tables > 0:
                console.print(f"[dim]({html_tables} HTML tables, {pattern_tables} pattern-based)[/dim]")
            
            # Select table
            table_index = self.select_table()
            if table_index is None:
                return
            
            # Ask if user wants to scrape multiple pages
            if Confirm.ask("\nDo you want to crawl multiple pages?", default=False):
                # Determine crawling method: infinite scroll or pagination
                use_infinite_scroll = Confirm.ask("Use infinite scroll instead of pagination?", default=False)
                
                if use_infinite_scroll:
                    # Infinite scroll mode - for pages that load more data when scrolling
                    # Generate selector for the current table
                    table_selector = self.generate_smart_selector("arguments[0]")
                    # Store the selector in the page for later use (evaluate accepts a single argument)
                    self.page.evaluate("sel => { window._tableSelector = sel; }", table_selector)
                    
                    # Try to find the scrollable container element
                    scrollable = self.page.evaluate("""
                        async () => {
                            const table = document.querySelector(window._tableSelector);
                            if (!table) return null;

                            // Common CSS patterns for scrollable containers
                            const patterns = [
                                '.results-container', '.scroll-container', '.data-container',
                                '[class*="scroll"]', '[class*="results"]', 'main', 'body'
                            ];
                            
                            for (const pattern of patterns) {
                                const container = table.closest(pattern);
                                if (container) return pattern;
                            }
                            
                            return 'body';
                        }
                    """)
                    
                    if scrollable:
                        # Perform lazy loading
                        self.lazy_load_scroll(scrollable, table_selector)
                        
                        # Re-find tables after scrolling
                        self.find_tables()
                        if self.tables_data:
                            df = self.tables_data[0]['df']  # Use the first/largest table
                        else:
                            df = self.tables_data[table_index]['df']
                    else:
                        console.print("[yellow]Could not find scrollable container[/yellow]")
                        df = self.tables_data[table_index]['df']
                else:
                    # Traditional pagination mode - clicking "next" button
                    need_to_locate = True  # Flag to determine if we need to find the button

                    # Check if we have a previously saved "next" button selector for this domain
                    if self.next_button_selector:
                        use_saved = Confirm.ask(f"Use saved 'Next' selector for {self.current_domain}?", default=True)
                        if use_saved:
                            need_to_locate = False  # Don't need to locate, use saved
                            console.print(f"[green]Using saved selector: {self.next_button_selector}[/green]")
                    
                    # Crawl if we either located the button now or already have a saved selector
                    if (need_to_locate and self.locate_next_button()) or (not need_to_locate and self.next_button_selector):
                        max_pages = int(Prompt.ask("Maximum pages to crawl", default="10"))
                        delay = int(Prompt.ask("Delay between pages (seconds)", default="1"))

                        df, pages = self.crawl_pages(max_pages, delay)
                        console.print(f"\n[green]βœ“ Crawled {pages} pages, collected {len(df)} rows[/green]")
                    else:
                        df = self.tables_data[table_index]['df']
            else:
                df = self.tables_data[table_index]['df']
            
            # Display final data
            console.print("\n[bold]Final Data Preview:[/bold]")
            self.display_table_preview(df, max_rows=20)
            
            # Save data
            if Confirm.ask("\nSave the data?", default=True):
                self.save_data(df)
                
        except KeyboardInterrupt:
            console.print("\n[yellow]Interrupted by user[/yellow]")
        except Exception as e:
            console.print(f"\n[red]Error: {e}[/red]")
        finally:
            if self.browser:
                self.browser.close()
    
    def run_automated(self, url: str, headless: bool = True):
        """Automated run for command line usage with minimal interaction

        This method is designed for non-interactive use cases where
        the scraper automatically selects the best table and saves it.

        Args:
            url: URL to scrape
            headless: Whether to run browser in headless mode
        """
        try:
            # Launch browser in specified mode
            self.launch_browser(url, headless)
            
            # Find tables with intelligent detection
            console.print("\n[bold]Searching for tables...[/bold]")
            tables = self.find_tables()
            
            # Also find pattern-based tables if enabled
            if self.enable_pattern_detection:
                console.print("[dim]Also checking for div/list-based tables...[/dim]")
                pattern_tables = self.find_pattern_tables()
                if pattern_tables:
                    tables.extend(pattern_tables)
                    # Re-sort all by score
                    self.tables_data = sorted(tables, key=lambda x: x['score'], reverse=True)[:5]
            
            if not self.tables_data:
                console.print("[red]No tables found on this page![/red]")
                return
                
            console.print(f"[green]Found {len(self.tables_data)} table candidate(s)[/green]")
            
            # Show what types were found
            html_tables = sum(1 for t in self.tables_data if t.get('type') != 'pattern')
            pattern_tables = sum(1 for t in self.tables_data if t.get('type') == 'pattern')
            if pattern_tables > 0:
                console.print(f"[dim]({html_tables} HTML tables, {pattern_tables} pattern-based)[/dim]")
            
            # Auto-select the highest scoring table
            table_index = 0
            candidate = self.tables_data[table_index]
            
            table_type = "πŸ“Š HTML Table" if candidate.get('type') != 'pattern' else "πŸ“‹ Pattern-based"
            console.print(f"\n[bold]Auto-selected: {table_type}[/bold]")
            console.print(f"Rows: {candidate['rows']}, Columns: {candidate['cols']}")
            if 'score' in candidate:
                console.print(f"Score: {candidate['score']:,.0f}")
            
            self.highlight_table(table_index)
            
            # Extract data
            df = candidate['df']
            
            # Display preview
            console.print("\n[bold]Data Preview:[/bold]")
            self.display_table_preview(df, max_rows=10)
            
            # Auto-save as CSV
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"scraped_data_{timestamp}.csv"
            df.to_csv(filename, index=False)
            console.print(f"\n[green]βœ“ Auto-saved to {filename}[/green]")
            console.print(f"[dim]Scraped {len(df)} rows Γ— {len(df.columns)} columns[/dim]")
                
        except KeyboardInterrupt:
            console.print("\n[yellow]Interrupted by user[/yellow]")
        except Exception as e:
            console.print(f"\n[red]Error: {e}[/red]")
        finally:
            if self.browser:
                self.browser.close()

def main():
    """Main entry point for the Table Scraper CLI"""
    # Set up command line argument parser
    parser = argparse.ArgumentParser(description='TableScraper - Python Edition')
    parser.add_argument('url', nargs='?', help='URL to scrape')  # Optional URL argument
    parser.add_argument('--headless', action='store_true', help='Run browser in headless mode')  # Force headless
    parser.add_argument('--extract-nested', action='store_true', help='Extract links and images from cells')  # Enhanced extraction
    parser.add_argument('--no-patterns', action='store_true', help='Disable pattern detection')  # HTML tables only
    args = parser.parse_args()
    
    # Display welcome banner
    console.print(Panel.fit(
        "[bold blue]TableScraper - Python Edition[/bold blue]\n"
        "A Playwright-powered table scraper",
        border_style="blue"
    ))
    
    # Get URL from command line or prompt user
    if args.url:
        # URL provided as command line argument
        url = args.url
        console.print(f"\n[green]Using URL from command line: {url}[/green]")
    else:
        # No URL provided, need to get it
        if sys.stdin.isatty():  # Check if running interactively
            # Prompt user for URL with a default example
            url = Prompt.ask("\nEnter URL to scrape",
                             default="https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue")
        else:
            # Non-interactive mode without URL - show error
            console.print("\n[red]No URL provided. Use: python table_scraper_cli.py [/red]")
            sys.exit(1)
    
    # Create scraper instance and apply command line options
    scraper = TableScraperCLI()
    if args.no_patterns:
        # Disable detection of div/list-based tables
        scraper.enable_pattern_detection = False
    if args.extract_nested:
        # Enable extraction of links and images from table cells
        scraper.extract_nested_data = True
    
    # Run the scraper
    try:
        # Only force headless mode if --headless flag was used
        headless_override = args.headless if args.headless else None
        # Execute main scraping workflow
        scraper.run(url, headless_override=headless_override)
    except KeyboardInterrupt:
        # Handle Ctrl+C gracefully
        console.print("\n[yellow]Interrupted by user[/yellow]")
    except Exception as e:
        # Display any other errors
        console.print(f"\n[red]Error: {e}[/red]")

# Entry point when script is run directly
if __name__ == "__main__":
    main()  # Execute main function

For more complex scenarios, use BeautifulSoup for custom parsing, Selenium for JavaScript-heavy sites, or requests-html for basic dynamic content. See the full guide below.

Method 1: Using BeautifulSoup

BeautifulSoup is the most popular library for web scraping in Python. It provides a simple and intuitive API for parsing HTML and extracting data.

Basic Table Scraping

Let's start with a simple example of scraping a table from a webpage:

Basic table scraping with BeautifulSoup
# Import required libraries for web scraping and data manipulation
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Send a GET request to the webpage containing the table
url = "https://example.com/table-page"
response = requests.get(url)

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Find the first table element on the page
table = soup.find('table')

# Extract column headers from the table
headers = []
for th in table.find_all('th'):  # Find all header cells
    headers.append(th.text.strip())  # Get text and remove whitespace

# Extract data rows from the table
rows = []
for tr in table.find_all('tr')[1:]:  # Skip first row (headers)
    row = []
    for td in tr.find_all('td'):  # Find all data cells in the row
        row.append(td.text.strip())  # Get text and remove whitespace
    if row:  # Only append non-empty rows to avoid blank entries
        rows.append(row)

# Create a pandas DataFrame from the extracted data
df = pd.DataFrame(rows, columns=headers)
print(df)  # Display the scraped table data

Handling Multiple Tables

When a page contains multiple tables, you can select specific tables using various methods:

Selecting specific tables
# Method 1: Select table by index position
tables = soup.find_all('table')  # Get all tables on the page
second_table = tables[1]  # Get the second table (0-indexed)

# Method 2: Select table by CSS class name
table = soup.find('table', {'class': 'data-table'})

# Method 3: Select table by unique ID attribute
table = soup.find('table', {'id': 'financial-data'})

# Method 4: Use CSS selector for nested elements
table = soup.select_one('div.container table.results')
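
Once you've isolated the right table element, you don't have to walk its rows by hand; you can hand its HTML straight to pandas. Here's a minimal sketch, assuming table is one of the elements selected above:

Converting a selected table to a DataFrame
# Parse only the selected table's HTML instead of re-fetching the whole page
from io import StringIO
import pandas as pd

df = pd.read_html(StringIO(str(table)))[0]  # str(table) serializes the element back to HTML
print(df.head())  # Preview the resulting DataFrame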

Method 2: Using pandas read_html()

For simple table extraction, pandas provides a convenient read_html() function that can automatically detect and parse tables from HTML:

Using pandas read_html()
# Import pandas library for easy table reading
import pandas as pd

# Read all tables directly from a URL (automatically detects tables)
url = "https://example.com/table-page"
tables = pd.read_html(url)  # Returns a list of DataFrames

# Display information about the tables found
print(f"Found {len(tables)} tables")
df = tables[0]  # Select the first table from the list
print(df.head())  # Show the first 5 rows

# Alternative: Pass HTML content directly as a string
# (recent pandas versions expect literal HTML to be wrapped in StringIO)
from io import StringIO

html_content = "<table><tr><th>Name</th><th>Age</th></tr><tr><td>John</td><td>25</td></tr></table>"
df = pd.read_html(StringIO(html_content))[0]  # Parse HTML string and get first table
print(df)  # Display the resulting DataFrame

Advanced Options with read_html()

The read_html() function offers several parameters for more control:

Advanced read_html() options
# Filter tables by HTML attributes (only get tables with specific class)
df = pd.read_html(url, attrs={'class': 'wikitable'})[0]

# Use the first column as the DataFrame index
df = pd.read_html(url, index_col=0)[0]

# Skip the first 2 rows of the table (useful for headers)
df = pd.read_html(url, skiprows=2)[0]

# Automatically parse date columns into datetime objects
df = pd.read_html(url, parse_dates=True)[0]

# Handle numbers with thousands separators (e.g., "1,000")
df = pd.read_html(url, thousands=',')[0]
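
These options can be combined. For example, the Wikipedia page used as the default URL in the TL;DR script exposes its data in a wikitable, so you can target that class and strip the thousands separators in one call (the page's exact layout may change over time):

Combining read_html() options
import pandas as pd

url = "https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue"
# Only parse tables with the 'wikitable' class and normalize comma-separated numbers
df = pd.read_html(url, attrs={'class': 'wikitable'}, thousands=',')[0]
print(df.head())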

Method 3: Using requests-html

The requests-html library combines the simplicity of requests with JavaScript support, making it ideal for modern web pages:

Scraping with requests-html
# Import libraries for JavaScript-enabled web scraping
from requests_html import HTMLSession
import pandas as pd

# Create a new HTML session for making requests
session = HTMLSession()

# Get the webpage and render any JavaScript content
r = session.get('https://example.com/dynamic-table')
r.html.render()  # Execute JavaScript to load dynamic content

# Find all table elements on the rendered page
tables = r.html.find('table')

# Extract data from the first table only
data = []
for table in tables[:1]:  # Process only the first table
    # Extract column headers from the table
    headers = [th.text for th in table.find('th')]

    # Extract data from each row (skip header row)
    for tr in table.find('tr')[1:]:
        row = [td.text for td in tr.find('td')]  # Get text from each cell
        if row:  # Only add non-empty rows
            data.append(row)

# Create a pandas DataFrame from the extracted data
df = pd.DataFrame(data, columns=headers)
print(df)  # Display the scraped table
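
Note that the first call to render() downloads a Chromium build, which can take a minute. For pages that load data slowly or on scroll, render() accepts extra parameters; the values below are illustrative:

Giving render() more time
# Scroll the page a few times, pause after loading, and allow a longer timeout
r.html.render(scrolldown=3, sleep=2, timeout=30)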

Handling Dynamic Tables with Selenium

For tables loaded dynamically via JavaScript or requiring user interaction, Selenium is the go-to solution:

Dynamic table scraping with Selenium
# Import Selenium components for browser automation
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

# Initialize Chrome browser driver
driver = webdriver.Chrome()

try:
    # Navigate to the webpage containing the dynamic table
    driver.get("https://example.com/dynamic-table")

    # Wait up to 10 seconds for the table to load
    wait = WebDriverWait(driver, 10)
    table = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "table.data-table"))
    )

    # Extract column headers from the table
    headers = [th.text for th in table.find_elements(By.TAG_NAME, "th")]

    # Extract data rows from the table
    rows = []
    for tr in table.find_elements(By.TAG_NAME, "tr")[1:]:  # Skip header row
        row = [td.text for td in tr.find_elements(By.TAG_NAME, "td")]
        if row:  # Only add non-empty rows
            rows.append(row)

    # Create a pandas DataFrame from the scraped data
    df = pd.DataFrame(rows, columns=headers)
    print(df)  # Display the final table

finally:
    # Always close the browser to free up resources
    driver.quit()
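
If you don't need to watch the browser work, you can run Chrome headless. A minimal sketch; the option names reflect recent Chrome/Selenium releases and may vary with your setup:

Running Selenium headless
from selenium import webdriver

# Configure Chrome to run without a visible window
options = webdriver.ChromeOptions()
options.add_argument("--headless=new")  # New headless mode in recent Chrome versions
options.add_argument("--window-size=1920,1080")  # Keep a realistic viewport size

driver = webdriver.Chrome(options=options)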

Handling Pagination

Many tables are paginated. Here's how to scrape all pages:

Scraping paginated tables
# This continues the Selenium example above (reuses driver, wait, and headers)
import time
from selenium.common.exceptions import NoSuchElementException

# Initialize list to store data from all pages
all_data = []

# Navigate to the first page of the paginated table
driver.get("https://example.com/paginated-table")

# Loop through all pages until no more pages are available
while True:
    # Wait for the table to load on the current page
    table = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "table"))
    )

    # Extract data from the current page
    for tr in table.find_elements(By.TAG_NAME, "tr")[1:]:  # Skip header
        row = [td.text for td in tr.find_elements(By.TAG_NAME, "td")]
        if row:  # Only add non-empty rows
            all_data.append(row)

    # Attempt to navigate to the next page
    try:
        next_button = driver.find_element(By.CSS_SELECTOR, "a.next-page")
        # Check if the next button is disabled (last page)
        if "disabled" in next_button.get_attribute("class"):
            break  # Exit loop if on the last page
        next_button.click()  # Click to go to next page
        time.sleep(2)  # Wait for the new page to load
    except NoSuchElementException:
        break  # Exit if the next button is not found (no more pages)

# Create final DataFrame with all collected data
df = pd.DataFrame(all_data, columns=headers)

Best Practices and Tips

1. Error Handling

Always implement proper error handling to make your scraper more robust:

Robust error handling
# Import libraries for requests, parsing, timing, and error handling
import time
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

def scrape_table_with_retry(url, max_retries=3):
    """Scrape a table with retry logic and exponential backoff"""
    for attempt in range(max_retries):
        try:
            # Make HTTP request with timeout
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # Raise exception for bad status codes

            # Parse the HTML content
            soup = BeautifulSoup(response.content, 'html.parser')
            table = soup.find('table')

            # Validate that a table was found
            if not table:
                raise ValueError("No table found on the page")

            # Process the table and return data
            return extract_table_data(table)

        except RequestException as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                # Wait before retrying (exponential backoff: 1s, 2s, 4s)
                time.sleep(2 ** attempt)
            else:
                # Re-raise the exception if all retries failed
                raise
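
The extract_table_data helper above is left to you; a minimal sketch of what it might look like, returning a DataFrame built from the header and data cells:

A simple extract_table_data helper
import pandas as pd

def extract_table_data(table):
    """Turn a BeautifulSoup table element into a DataFrame"""
    headers = [th.text.strip() for th in table.find_all('th')]
    rows = []
    for tr in table.find_all('tr')[1:]:  # Skip the header row
        cells = [td.text.strip() for td in tr.find_all('td')]
        if cells:  # Ignore rows without data cells
            rows.append(cells)
    return pd.DataFrame(rows, columns=headers or None)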

2. Respect Robots.txt

Always check and respect the website's robots.txt file:

Checking robots.txt
# Import libraries for parsing robots.txt files and URLs
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

def can_fetch(url):
    """Check if a URL can be scraped according to robots.txt"""
    # Build the robots.txt URL from the site's root, not from the page URL
    parsed = urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    # Create a robot file parser instance and point it at robots.txt
    rp = RobotFileParser()
    rp.set_url(robots_url)
    # Read and parse the robots.txt file
    rp.read()
    # Check if any user agent (*) can fetch the given URL
    return rp.can_fetch("*", url)
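
You can then gate your requests on this check before scraping:

Using the robots.txt check
url = "https://example.com/table-page"
if can_fetch(url):
    response = requests.get(url, timeout=10)
else:
    print(f"Scraping disallowed by robots.txt: {url}")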

3. Add Headers

Use appropriate headers to avoid being blocked:

Setting request headers
# Define HTTP headers to mimic a real browser request
headers = {
    # Identify as a real browser to avoid blocking
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    # Specify accepted content types
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # Set preferred language
    'Accept-Language': 'en-US,en;q=0.5',
    # Specify accepted encoding methods
    'Accept-Encoding': 'gzip, deflate',
    # Keep connection alive for better performance
    'Connection': 'keep-alive',
}

# Make request with the custom headers
response = requests.get(url, headers=headers)
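
If you make many requests to the same site, attaching the headers to a requests.Session keeps the configuration in one place and reuses connections:

Reusing headers with a Session
import requests

# Create a session that sends the same headers with every request
session = requests.Session()
session.headers.update(headers)

response = session.get(url, timeout=10)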

4. Rate Limiting

Implement rate limiting to avoid overwhelming the server:

Rate limiting implementation
# Import libraries for timing and efficient queue operations
import time
from collections import deque

class RateLimiter:
    """Rate limiter to control request frequency"""
    def __init__(self, calls, period):
        self.calls = calls  # Maximum number of calls allowed
        self.period = period  # Time period in seconds
        self.timestamps = deque()  # Store timestamps of recent calls

    def wait_if_needed(self):
        """Wait if necessary to respect rate limits"""
        now = time.time()

        # Remove timestamps older than the current period
        while self.timestamps and self.timestamps[0] < now - self.period:
            self.timestamps.popleft()

        # If we've hit the rate limit, wait until we can make another call
        if len(self.timestamps) >= self.calls:
            sleep_time = self.period - (now - self.timestamps[0])
            if sleep_time > 0:
                time.sleep(sleep_time)  # Wait before proceeding

        # Record the timestamp of this call
        self.timestamps.append(time.time())

# Create a rate limiter: 10 requests per minute
rate_limiter = RateLimiter(10, 60)

# Use the rate limiter when making multiple requests
for url in urls:
    rate_limiter.wait_if_needed()  # Ensure we don't exceed rate limits
    response = requests.get(url)

Common Issues and Solutions

1. Nested Tables

Handle tables within tables carefully:

Handling nested tables
# Find the main table by its CSS class
main_table = soup.find('table', {'class': 'main-table'})
# Rows are often wrapped in a <tbody>, so step into it when present
tbody = main_table.find('tbody', recursive=False) or main_table
# Get only direct child rows (excludes rows belonging to nested tables)
rows = tbody.find_all('tr', recursive=False)

2. Colspan and Rowspan

Handle cells that span multiple columns or rows:

Parsing tables with spans
def parse_table_with_spans(table):
    """Parse a table that contains cells with colspan and rowspan attributes"""
    # Get all table rows
    rows = table.find_all('tr')
    # Calculate maximum number of columns needed
    max_cols = max(sum(int(td.get('colspan', 1)) for td in row.find_all(['td', 'th']))
                   for row in rows)

    # Create a 2D grid to represent the table structure
    grid = [[None for _ in range(max_cols)] for _ in range(len(rows))]

    # Process each row and cell
    for row_idx, row in enumerate(rows):
        col_idx = 0
        for cell in row.find_all(['td', 'th']):
            # Skip cells that are already occupied by spanning cells
            while col_idx < max_cols and grid[row_idx][col_idx] is not None:
                col_idx += 1

            # Get colspan and rowspan values (default to 1)
            colspan = int(cell.get('colspan', 1))
            rowspan = int(cell.get('rowspan', 1))

            # Fill the grid for all cells covered by this spanning cell
            for r in range(rowspan):
                for c in range(colspan):
                    if row_idx + r < len(rows) and col_idx + c < max_cols:
                        grid[row_idx + r][col_idx + c] = cell.text.strip()

            # Move to the next column position
            col_idx += colspan

    return grid  # Return the completed grid structure
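
If the first grid row holds the headers, you can turn the result into a DataFrame directly:

Building a DataFrame from the span-aware grid
import pandas as pd

grid = parse_table_with_spans(table)  # table is a BeautifulSoup table element
df = pd.DataFrame(grid[1:], columns=grid[0])  # Use the first row as column headers
print(df.head())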

3. JavaScript-Rendered Content

For content loaded after page load:

Waiting for JavaScript content
# Wait up to 20 seconds for specific content to appear
wait = WebDriverWait(driver, 20)
wait.until(
    # Wait for a table with a specific data attribute indicating it's loaded
    EC.presence_of_element_located((By.XPATH, "//table[@data-loaded='true']"))
)

# Alternative: poll until the browser reports the document has finished loading
wait.until(lambda d: d.execute_script("return document.readyState") == "complete")

Conclusion

Web scraping tables in Python can be approached in multiple ways, each with its own advantages; the comparison table below summarizes the trade-offs.

Choose the method that best fits your specific use case, and always remember to scrape responsibly: respect robots.txt, implement rate limiting, and follow the website's terms of service.

Method              Speed      JavaScript Support   Complexity   Best For
BeautifulSoup       Fast       No                   Low          Static HTML tables
pandas.read_html()  Very Fast  No                   Very Low     Quick extraction
requests-html       Medium     Yes                  Low          Simple JS rendering
Selenium            Slow       Full                 High         Complex interactions