Skip to main content

Overview

Scrapling provides powerful methods to extract data from HTML elements. Whether you need text content, HTML markup, attributes, or structured data, Scrapling has you covered.

Basic Extraction

get()

Serialize an element to a string.
def get() -> TextHandler
For text nodes, returns the text value. For HTML elements, returns the outer HTML.
from scrapling import Fetcher

page = Fetcher.fetch('https://example.com')

# Get single element content
header = page.css('h1').first
if header:
    content = header.get()
    print(content)  # Returns HTML: <h1>Title</h1>

getall()

Extract data from all matched elements.
def getall() -> TextHandlers
For Selector (single element): Returns a single-element list containing the element’s serialized string. For Selectors (multiple elements): Serializes all elements and returns as a TextHandlers list.
# Single element returns list with one item
element = page.css('h1').first
result = element.getall()
print(result)  # ['<h1>Title</h1>']

extract() and extract_first()

Aliases for backward compatibility with other scraping libraries.
extract = getall
extract_first = get
# These are equivalent
text1 = page.css('p').extract_first()
text2 = page.css('p').get()

# These are equivalent
texts1 = page.css('p').extract()
texts2 = page.css('p').getall()

Text Extraction

text

Get the direct text content of an element.
@property
def text() -> TextHandler
Returns the text content of the element. For text nodes, returns the text value. For HTML elements, returns the element’s direct text (not including children).
# Get text content
title = page.css('h1').first
if title:
    print(title.text)  # Returns: "Title" (not <h1>Title</h1>)

# Text is a TextHandler with useful methods
price_text = page.css('.price').first.text
price_clean = price_text.strip().replace('$', '')
price_value = float(price_clean)

get_all_text()

Get all text from an element and its descendants.
def get_all_text(
    separator: str = "\n",
    strip: bool = False,
    ignore_tags: Tuple = ("script", "style"),
    valid_values: bool = True,
) -> TextHandler
separator
str
default:"\n"
Strings will be concatenated using this separator
strip
bool
default:"false"
If True, strings will be stripped before being concatenated
ignore_tags
Tuple
default:"('script', 'style')"
A tuple of tag names to ignore when extracting text
valid_values
bool
default:"true"
If enabled, elements with text-content that is empty or only whitespaces will be ignored
# Get all text from a container
article = page.css('article').first
full_text = article.get_all_text()
print(full_text)

HTML Extraction

html_content

Get the inner HTML of an element.
@property
def html_content() -> TextHandler
Returns the inner HTML code of the element.
# Get inner HTML
container = page.css('.container').first
inner_html = container.html_content
print(inner_html)  # HTML inside the container

prettify()

Get a formatted, prettified version of the HTML.
def prettify() -> TextHandler
Returns a prettified version of the element’s HTML code with proper indentation, as shown in the example below (the element’s own tag is included in the output).
# Get prettified HTML
element = page.css('div.complex').first
pretty_html = element.prettify()
print(pretty_html)
# Output:
# <div class="complex">
#   <p>Text</p>
#   <span>More text</span>
# </div>

body

Get the raw body without processing.
@property
def body() -> str | bytes
Returns the raw body of the current Selector without any processing. Useful for binary and non-HTML requests.
# Get raw body
raw_content = page.body

Attribute Extraction

attrib

Access element attributes.
@property
def attrib() -> AttributesHandler
Returns an AttributesHandler containing all attributes of the element.
# Get specific attribute
link = page.css('a').first
href = link.attrib.get('href')
class_name = link.attrib.get('class')

# Check if attribute exists
if 'data-id' in link.attrib:
    data_id = link['data-id']

Element Access with []

Direct attribute access using bracket notation.
def __getitem__(key: str) -> TextHandler
# Direct access
link = page.css('a').first
href = link['href']
class_name = link['class']

# Raises KeyError if attribute doesn't exist
try:
    missing = link['data-missing']
except KeyError:
    print("Attribute not found")

Check Attribute Existence

def __contains__(key: str) -> bool
# Check if attribute exists
link = page.css('a').first

if 'href' in link:
    print(f"Link points to: {link['href']}")

if 'target' in link:
    print(f"Opens in: {link['target']}")
else:
    print("Opens in same window")

Tag Information

tag

Get the tag name of an element.
@property
def tag() -> str
# Get tag name
element = page.css('.content').first
print(element.tag)  # e.g., "div"

# For text nodes
text_node = page.xpath('//p/text()').first
print(text_node.tag)  # "#text"

has_class()

Check if an element has a specific class.
def has_class(class_name: str) -> bool
class_name
str
required
The class name to check for
# Check for class
element = page.css('.container').first

if element.has_class('active'):
    print("Element is active")

if element.has_class('container'):
    print("Element is a container")

Regex Extraction

Extract data using regular expressions.

re()

Apply regex to element text and return all matches.
def re(
    regex: str | Pattern[str],
    replace_entities: bool = True,
    clean_match: bool = False,
    case_sensitive: bool = True,
) -> TextHandlers
regex
str | Pattern[str]
required
Can be either a compiled regular expression or a string
replace_entities
bool
default:"true"
If enabled, character entity references are replaced by their corresponding character
clean_match
bool
default:"false"
If enabled, ignores all whitespaces and consecutive spaces while matching
case_sensitive
bool
default:"true"
If disabled, the function will ignore letter case while matching
import re

# Extract all prices
page_text = page.get_all_text()
prices = page_text.re(r'\$\d+\.\d{2}')
print(prices)  # ['$19.99', '$29.99', '$9.99']

re_first()

Apply regex and return the first match.
def re_first(
    regex: str | Pattern[str],
    default=None,
    replace_entities: bool = True,
    clean_match: bool = False,
    case_sensitive: bool = True,
) -> TextHandler
default
Any
default:"None"
The default value to be returned if there is no match
import re

# Get first price or default
price = page.get_all_text().re_first(r'\$\d+\.\d{2}', default='$0.00')

# Extract product code
code = page.css('.product-info').first.text.re_first(
    r'SKU: ([A-Z0-9-]+)',
    default='N/A'
)

JSON Extraction

Extract and parse JSON data.

json()

Parse element content as JSON.
def json() -> Dict
Returns a parsed JSON response if the content is valid JSON, otherwise raises an error.
# Parse JSON from API response
response = Fetcher.fetch('https://api.example.com/data')
data = response.json()

print(data['items'])

URL Handling

urljoin()

Join relative URLs with the page’s base URL.
def urljoin(relative_url: str) -> str
relative_url
str
required
The relative URL to join with the page’s URL
# Convert relative URLs to absolute
links = page.css('a')

for link in links:
    relative_href = link.attrib.get('href', '')
    absolute_url = link.urljoin(relative_href)
    print(absolute_url)

# Example:
# page.url = 'https://example.com/products/'
# relative = '../about'
# result = 'https://example.com/about'

Practical Examples

Extract Product Information

# Extract structured product data
products = []

for item in page.css('.product-card'):
    product = {
        'name': item.css('.product-name').first.text.strip(),
        'price': item.css('.price').first.text.re_first(r'\d+\.\d{2}'),
        'image': item.css('img').first['src'],
        'url': item.urljoin(item.css('a').first['href']),
        'in_stock': item.has_class('in-stock'),
        'rating': float(item.css('[data-rating]').first['data-rating'])
    }
    products.append(product)

print(f"Found {len(products)} products")

Extract Article Content

# Extract article with metadata
article = page.css('article').first

if article:
    data = {
        'title': article.css('h1').first.text.strip(),
        'author': article.css('.author').first.text.strip(),
        'date': article.css('time').first['datetime'],
        'content': article.css('.content').first.get_all_text(
            separator='\n\n',
            strip=True,
            ignore_tags=('script', 'style', 'aside')
        ),
        'tags': [tag.text for tag in article.css('.tag')],
        'image_url': article.css('img').first['src'] if article.css('img') else None
    }

Extract Table Data

# Extract data from HTML table
table = page.css('table.data-table').first

if table:
    headers = [th.text.strip() for th in table.css('thead th')]
    
    rows = []
    for tr in table.css('tbody tr'):
        row = {}
        cells = tr.css('td')
        for i, cell in enumerate(cells):
            header = headers[i] if i < len(headers) else f'column_{i}'
            row[header] = cell.text.strip()
        rows.append(row)
    
    print(f"Extracted {len(rows)} rows")

Extract Contact Information

import re

# Extract various contact information
contact_section = page.css('.contact-info').first

if contact_section:
    text = contact_section.get_all_text()
    
    contact = {
        'emails': text.re(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'),
        'phones': text.re(r'\+?\d[\d\s\-\(\)]{8,}\d'),
        'address': contact_section.css('.address').first.text if contact_section.css('.address') else None,
    }

Build docs developers (and LLMs) love