Error Handling
Proper error handling is crucial for building reliable scrapers. Scrapling provides multiple layers of error handling from automatic retries to custom error callbacks.

Fetcher-Level Error Handling

Automatic Retries

All fetchers automatically retry failed requests:
from scrapling import StealthyFetcher

# Every fetcher retries failed requests automatically; both knobs below
# are optional and shown with non-default values.
response = StealthyFetcher.fetch(
    'https://example.com',
    retries=5,        # Retry up to 5 times (default: 3)
    retry_delay=2     # Wait 2 seconds between retries (default: 1)
)
Configuration: scrapling/engines/_browsers/_validators.py:88-89

Try-Catch Pattern

Handle exceptions manually:
from scrapling import StealthyFetcher
import logging

try:
    response = StealthyFetcher.fetch('https://example.com')
    print(response.status)
except TimeoutError as e:
    # Raised when the request exceeds the configured timeout.
    logging.error(f"Request timed out: {e}")
except RuntimeError as e:
    # Presumably raised for fetch/session-level failures — see the
    # "Session Errors" examples below for one such case.
    logging.error(f"Request failed: {e}")
except Exception as e:
    # Last-resort catch-all; keep it after the specific handlers.
    logging.error(f"Unexpected error: {e}")

Proxy Errors

Detect and handle proxy-related errors:
from scrapling import StealthySession, is_proxy_error

try:
    with StealthySession(proxy='http://bad-proxy:8080') as session:
        response = session.fetch('https://example.com')
except Exception as e:
    # is_proxy_error() classifies an exception as proxy-related by
    # matching its text against known indicators (listed below).
    if is_proxy_error(e):
        print("Proxy connection failed")
        # Try different proxy
    else:
        print(f"Other error: {e}")
Proxy error indicators:
# Lowercase substrings searched for in exception messages to decide
# whether a failure is proxy-related (quoted from the library source).
PROXY_ERROR_INDICATORS = {
    "net::err_proxy",
    "net::err_tunnel",
    "connection refused",
    "connection reset",
    "connection timed out",
    "failed to connect",
    "could not resolve proxy",
}
Source: scrapling/engines/toolbelt/proxy_rotation.py:7-15, 27-30

Spider Error Handling

On Error Hook

Handle errors at the spider level:
from scrapling import Spider

class MySpider(Spider):
    """Spider demonstrating the on_error hook for failed requests."""

    name = 'error_handler'
    start_urls = ['https://example.com']
    
    async def on_error(self, request, error):
        """Called when a request fails after all retries"""
        self.logger.error(f"Request failed: {request.url}")
        self.logger.error(f"Error: {error}")
        
        # Log to file
        with open('errors.log', 'a') as f:
            f.write(f"{request.url}: {error}\n")
        
        # Track a per-exception-type counter in the spider's custom stats
        error_type = type(error).__name__
        self.stats.custom_stats[error_type] = \
            self.stats.custom_stats.get(error_type, 0) + 1
    
    async def parse(self, response):
        yield {'title': response.css('title::text').get()}
Source: scrapling/spiders/spider.py:178-184

Graceful Degradation

Extract partial data even when errors occur:
class RobustSpider(Spider):
    """Spider that degrades gracefully: a failed field extraction
    becomes None instead of discarding the whole item."""

    name = 'robust'
    start_urls = ['https://example.com']

    def _safe_extract(self, response, selector, field):
        """Return the first match for `selector`, or None if extraction
        raises. Failures are logged as warnings, never propagated."""
        try:
            return response.css(selector).get()
        except Exception as e:
            self.logger.warning(f"Failed to extract {field}: {e}")
            return None

    async def parse(self, response):
        """Build the item field-by-field so one bad selector cannot
        abort the fields already extracted."""
        item = {'url': response.url}
        item['title'] = self._safe_extract(response, 'h1::text', 'title')
        item['price'] = self._safe_extract(response, '.price::text', 'price')
        item['description'] = self._safe_extract(response, '.desc::text', 'description')
        yield item

Error Statistics

Track error rates:
class StatsSpider(Spider):
    """Spider that counts errors per exception type and reports an
    error-rate summary when it closes."""

    name = 'stats_spider'
    start_urls = ['https://example.com']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Initialize eagerly instead of the lazy hasattr() pattern so the
        # attribute always exists, even when on_error never fires.
        self.error_counts = {}

    async def on_error(self, request, error):
        """Count each failed request under its exception class name."""
        error_type = type(error).__name__
        self.error_counts[error_type] = \
            self.error_counts.get(error_type, 0) + 1

    async def on_close(self):
        """Print error summary"""
        if self.error_counts:
            self.logger.info("Error Summary:")
            for error_type, count in self.error_counts.items():
                self.logger.info(f"  {error_type}: {count}")

        # Calculate error rate (guarded against division by zero when
        # no requests were made).
        total_requests = self.stats.request_count
        total_errors = self.stats.error_count

        if total_requests > 0:
            error_rate = (total_errors / total_requests) * 100
            self.logger.info(f"Error rate: {error_rate:.2f}%")

    async def parse(self, response):
        yield {'data': response.css('.content::text').get()}

Logging Configuration

Custom Log Levels

import logging

class VerboseSpider(Spider):
    """Spider with DEBUG-level logging and a custom log format."""

    name = 'verbose'
    start_urls = ['https://example.com']
    
    # Logging configuration
    logging_level = logging.DEBUG  # Show all logs
    logging_format = "[%(asctime)s] %(levelname)s: %(message)s"
    logging_date_format = "%Y-%m-%d %H:%M:%S"  # strftime-style timestamp format
    
    async def parse(self, response):
        self.logger.debug(f"Parsing {response.url}")
        yield {'data': response.text}
Source: scrapling/spiders/spider.py:87-89

Log to File

class FileLogSpider(Spider):
    """Spider that writes its log output to a file."""

    name = 'file_logger'
    start_urls = ['https://example.com']
    
    log_file = 'spider.log'  # Log to file
    logging_level = logging.INFO
    
    async def parse(self, response):
        self.logger.info(f"Scraped {response.url}")
        yield {'title': response.css('title::text').get()}
Source: scrapling/spiders/spider.py:90, 118-122

Log Counting

Track log message counts:
class LogCountSpider(Spider):
    """Spider that reports how many messages were logged at each level."""

    name = 'log_counter'
    start_urls = ['https://example.com']
    
    async def on_close(self):
        # Access log counts.
        # NOTE(review): _log_counter is a private spider attribute;
        # get_counts() appears to return a dict keyed by level name —
        # confirm against the library source before relying on the keys.
        counts = self._log_counter.get_counts()
        
        self.logger.info("Log Summary:")
        self.logger.info(f"  Debug: {counts['debug']}")
        self.logger.info(f"  Info: {counts['info']}")
        self.logger.info(f"  Warning: {counts['warning']}")
        self.logger.info(f"  Error: {counts['error']}")
        self.logger.info(f"  Critical: {counts['critical']}")
    
    async def parse(self, response):
        self.logger.info("Processing response")
        yield {'data': response.text}
Source: scrapling/spiders/spider.py:21-56, 110-112

Session Errors

Session Not Started

from scrapling import StealthySession

session = StealthySession()

try:
    # This will fail - session not started
    response = session.fetch('https://example.com')
except RuntimeError as e:
    print(f"Error: {e}")
    # Use context manager instead: entering the `with` block starts
    # the session, and it is closed automatically on exit.
    with session:
        response = session.fetch('https://example.com')

Context Manager Closed

with StealthySession() as session:
    response = session.fetch('https://example.com')

# Session is closed here: fetching outside the `with` block raises.
try:
    response = session.fetch('https://another.com')
except RuntimeError as e:
    print("Session closed")
Source: scrapling/engines/_browsers/_stealth.py:212-213

Session Configuration Errors

from scrapling import Spider, SessionConfigurationError

class BadConfigSpider(Spider):
    """Example spider whose session configuration deliberately fails."""

    name = 'bad_config'
    
    def configure_sessions(self, manager):
        # This will raise SessionConfigurationError
        # (the spider wraps configuration failures in that exception type)
        raise ValueError("Invalid configuration")
    
    async def parse(self, response):
        yield {}

try:
    spider = BadConfigSpider()
except SessionConfigurationError as e:
    print(f"Session configuration failed: {e}")
Source: scrapling/spiders/spider.py:59-62, 130-137

Validation Errors

Invalid Arguments

from scrapling import StealthyFetcher

try:
    response = StealthyFetcher.fetch(
        'https://example.com',
        timeout=-1000  # Invalid: negative timeout
    )
except TypeError as e:
    # The fetcher's argument validators raise TypeError for bad values.
    print(f"Invalid argument: {e}")

Type Validation

try:
    response = StealthyFetcher.fetch(
        'https://example.com',
        selector_config="not a dict"  # Invalid: must be dict
    )
except TypeError as e:
    print(f"Type error: {e}")
Source: scrapling/fetchers/stealth_chrome.py:54-55

Proxy Validation

from scrapling import ProxyRotator

try:
    # Invalid: dict without 'server' key
    rotator = ProxyRotator([{'username': 'user'}])
except ValueError as e:
    print(f"Invalid proxy config: {e}")

try:
    # Invalid: empty list
    rotator = ProxyRotator([])
except ValueError as e:
    print(f"At least one proxy required: {e}")
Source: scrapling/engines/toolbelt/proxy_rotation.py:64-84

Timeout Handling

Request Timeouts

from scrapling import StealthyFetcher
import asyncio

try:
    response = StealthyFetcher.fetch(
        'https://very-slow-site.com',
        timeout=5000  # 5 second timeout (value is in milliseconds)
    )
except TimeoutError as e:
    print("Request timed out")

Page Pool Timeouts

from scrapling import StealthySession

try:
    # max_pages caps the browser-page pool size.
    with StealthySession(max_pages=1) as session:
        # This will timeout if page pool is full
        response = session.fetch('https://example.com')
except TimeoutError as e:
    print("No available pages in pool")
Source: scrapling/engines/_browsers/_base.py:275-283

Complete Error Handling Example

from scrapling import Spider, StealthySession, ProxyRotator
import asyncio
import logging
from datetime import datetime

class ProductionSpider(Spider):
    """Production-grade spider demonstrating layered error handling:
    proxy rotation, per-error-type tracking, block detection with
    escalating retries, partial-data parsing, and a close-time summary.
    """

    name = 'production'
    start_urls = ['https://example.com']
    
    # Concurrency
    concurrent_requests = 8
    max_blocked_retries = 3
    
    # Logging
    logging_level = logging.INFO
    # NOTE(review): evaluated once at class-definition time; assumes the
    # 'logs/' directory already exists — confirm or create it beforehand.
    log_file = f'logs/{datetime.now():%Y%m%d_%H%M%S}.log'
    
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.error_counts = {}  # exception class name -> occurrence count
        self.failed_urls = []   # one dict per failed request (see on_error)
    
    def configure_sessions(self, manager):
        """Register the default stealth session behind a proxy rotator.

        Logs and re-raises any configuration failure.
        """
        try:
            rotator = ProxyRotator([
                'http://proxy1:8080',
                'http://proxy2:8080',
            ])
            
            manager.add('default', StealthySession(
                proxy_rotator=rotator,
                retries=5,
                retry_delay=2,
                timeout=30000,
            ))
        except Exception as e:
            self.logger.error(f"Session configuration failed: {e}")
            raise
    
    async def on_error(self, request, error):
        """Log and track errors"""
        # Track error type
        error_type = type(error).__name__
        self.error_counts[error_type] = \
            self.error_counts.get(error_type, 0) + 1
        
        # Save failed URL
        self.failed_urls.append({
            'url': request.url,
            'error': str(error),
            'error_type': error_type,
            'timestamp': datetime.now().isoformat(),
        })
        
        # Log error
        self.logger.error(
            f"Request failed: {request.url} | "
            f"Error: {error_type} - {error}"
        )
        
        # Alert on critical errors
        if error_type in {'RuntimeError', 'TimeoutError'}:
            await self._send_alert(request, error)
    
    async def _send_alert(self, request, error):
        """Send alert for critical errors"""
        # Your alerting logic (email, Slack, etc.)
        pass
    
    async def is_blocked(self, response):
        """Detect blocked requests"""
        # Common anti-bot status codes: forbidden, rate-limited, unavailable.
        if response.status in {403, 429, 503}:
            return True
        
        # Fall back to scanning the page body for block-page phrases.
        blocked_phrases = ['captcha', 'access denied', 'rate limit']
        content_lower = response.text.lower()
        
        return any(phrase in content_lower for phrase in blocked_phrases)
    
    async def retry_blocked_request(self, request, response):
        """Modify request before retry"""
        # NOTE(review): assumes request.meta is a dict that survives
        # between retries — confirm against the Spider implementation.
        retry_count = request.meta.get('retry_count', 0)
        
        self.logger.warning(
            f"Request blocked: {request.url} | "
            f"Retry {retry_count + 1}/{self.max_blocked_retries}"
        )
        
        # Escalating delays: 5s, 10s, 15s, ...
        await asyncio.sleep(5 * (retry_count + 1))
        
        request.meta['retry_count'] = retry_count + 1
        return request
    
    async def parse(self, response):
        """Parse with error handling"""
        item = {'url': response.url}
        
        try:
            # Extract data with fallbacks: h1, then <title>, then a default.
            item['title'] = response.css('h1::text').get() or \
                          response.css('title::text').get() or \
                          'No title'
            
            item['content'] = response.css('.content::text').getall()
            
            # Validate item
            if not item['content']:
                self.logger.warning(
                    f"Empty content for {response.url}"
                )
            
            yield item
            
        except Exception as e:
            self.logger.error(
                f"Parsing failed for {response.url}: {e}"
            )
            # Yield partial data (at minimum the URL) rather than nothing.
            yield item
    
    async def on_close(self):
        """Print summary and save failed URLs"""
        self.logger.info("\n" + "="*50)
        self.logger.info("Spider Summary")
        self.logger.info("="*50)
        
        # Stats
        self.logger.info(f"Total requests: {self.stats.request_count}")
        self.logger.info(f"Successful: {self.stats.success_count}")
        self.logger.info(f"Failed: {self.stats.error_count}")
        self.logger.info(f"Items scraped: {self.stats.item_count}")
        
        # Error breakdown, most frequent error types first
        if self.error_counts:
            self.logger.info("\nError Breakdown:")
            for error_type, count in sorted(
                self.error_counts.items(),
                key=lambda x: x[1],
                reverse=True
            ):
                self.logger.info(f"  {error_type}: {count}")
        
        # Log counts
        log_counts = self._log_counter.get_counts()
        self.logger.info("\nLog Counts:")
        self.logger.info(f"  Warnings: {log_counts['warning']}")
        self.logger.info(f"  Errors: {log_counts['error']}")
        
        # Save failed URLs so a later run can re-crawl them
        if self.failed_urls:
            import json
            
            with open('failed_urls.json', 'w') as f:
                json.dump(self.failed_urls, f, indent=2)
            
            self.logger.info(
                f"\nSaved {len(self.failed_urls)} failed URLs to "
                "failed_urls.json"
            )

if __name__ == '__main__':
    try:
        spider = ProductionSpider()
        result = spider.start(use_uvloop=True)
        
        print(f"\nScraped {len(result.items)} items")
        
        if result.paused:
            print("Spider was paused - resume with same command")
        
    except KeyboardInterrupt:
        print("\nSpider interrupted by user")
    except Exception as e:
        print(f"\nFatal error: {e}")
        # logging.exception records the full traceback, unlike logging.error.
        logging.exception("Spider crashed")

Best Practices

Always use context managers for sessions:
# Good
with StealthySession() as session:
    response = session.fetch(url)

# Bad
session = StealthySession()
session.start()
response = session.fetch(url)
session.close()  # Easy to forget!
Comprehensive logging helps debugging:
self.logger.info(f"Fetching {url}")
self.logger.warning(f"Retry {attempt}")
self.logger.error(f"Failed: {error}")
Monitor error rates to detect issues:
error_rate = errors / total_requests
if error_rate > 0.3:  # 30%
    self.logger.warning("High error rate!")
Extract partial data when possible:
try:
    item = {'required': extract_required()}
except:
    return None  # Skip item

try:
    item['optional'] = extract_optional()
except:
    item['optional'] = None  # Keep item

Handling Blocked Requests

Detect and retry blocked requests

Performance Optimization

Optimize retry strategies

Anti-Bot Bypass

Avoid errors with better stealth

Cloudflare Turnstile

Handle Cloudflare challenges

Build docs developers (and LLMs) love