Scrapling provides custom string and attribute handling types that extend Python’s built-in types with additional functionality for web scraping workflows.
TextHandler
Extended string type with scraping-specific methods.
Class Definition
from scrapling.core.custom_types import TextHandler
class TextHandler(str):
"""Extends standard Python string by adding more functionality"""
Additional Methods
clean
def clean(self, remove_entities: bool = False) -> TextHandler
Return a cleaned version of the string after removing whitespace and consecutive spaces.
remove_entities: If True, also replaces HTML entities with their corresponding characters.
Example:
text = TextHandler(" Hello\n\t World ")
print(text.clean()) # "Hello World"
html_text = TextHandler("Price: &pound;10")
print(html_text.clean(remove_entities=True)) # "Price: £10"
def re(
self,
regex: str | Pattern,
replace_entities: bool = True,
clean_match: bool = False,
case_sensitive: bool = True,
check_match: bool = False
) -> TextHandlers | bool
Apply regex to the text and return matches.
regex: Regular expression pattern (string or compiled).
replace_entities: Replace HTML entities in matches.
clean_match: Clean whitespace before matching.
case_sensitive: Whether to match case-sensitively.
check_match: If True, return a bool indicating whether the pattern matches (no results extraction).
Returns: TextHandlers list of matches, or bool if check_match=True
Example:
text = TextHandler("Email: john@example.com, Admin: admin@example.com")
# Extract all emails
emails = text.re(r'\b[\w.-]+@[\w.-]+\.\w+\b')
print(emails) # TextHandlers(['john@example.com', 'admin@example.com'])
# Check if pattern matches
has_email = text.re(r'\b[\w.-]+@[\w.-]+\.\w+\b', check_match=True)
print(has_email) # True
# Case-insensitive matching
text = TextHandler("Hello World")
result = text.re(r'hello', case_sensitive=False)
print(result) # TextHandlers(['Hello'])
re_first
def re_first(
self,
regex: str | Pattern,
default: Any = None,
replace_entities: bool = True,
clean_match: bool = False,
case_sensitive: bool = True
) -> TextHandler
Apply regex and return the first match, or default if no match.
regex: Regular expression pattern (string or compiled).
default: Value to return if no match is found.
replace_entities: Replace HTML entities in the match.
clean_match: Clean whitespace before matching.
case_sensitive: Whether to match case-sensitively.
Example:
text = TextHandler("Price: $49.99")
price = text.re_first(r'\$(\d+\.\d+)', default="0.00")
print(price) # "49.99"
no_match = text.re_first(r'€(\d+)', default="Not found")
print(no_match) # "Not found"
json
Parse the text as JSON.
Returns: Parsed JSON as dictionary
Raises: JSON parsing error if invalid
Example:
json_text = TextHandler('{"name": "John", "age": 30}')
data = json_text.json()
print(data) # {'name': 'John', 'age': 30}
sort
def sort(self, reverse: bool = False) -> TextHandler
Return a sorted version of the string.
reverse: If True, sort in descending order.
Example:
text = TextHandler("dcba")
print(text.sort()) # "abcd"
print(text.sort(reverse=True)) # "dcba"
Scrapy/Parsel Compatibility
For easy migration from Scrapy/Parsel:
# These methods are aliases for compatibility
text.get() # Returns self
text.get_all() # Returns self
text.extract() # Alias for get_all()
text.extract_first() # Alias for get()
TextHandlers
List of TextHandler objects with collective operations.
Class Definition
from scrapling.core.custom_types import TextHandlers
class TextHandlers(List[TextHandler]):
"""Subclass of list with additional methods for TextHandler items."""
Methods
def re(
self,
regex: str | Pattern,
replace_entities: bool = True,
clean_match: bool = False,
case_sensitive: bool = True
) -> TextHandlers
Apply regex to each element and return flattened results.
Example:
texts = TextHandlers([TextHandler("Item 1"), TextHandler("Item 22")])
numbers = texts.re(r'\d+')
print(numbers) # TextHandlers(['1', '22'])
re_first
def re_first(
self,
regex: str | Pattern,
default: Any = None,
replace_entities: bool = True,
clean_match: bool = False,
case_sensitive: bool = True
) -> TextHandler
Apply regex to each element and return the first match overall.
Example:
texts = TextHandlers([
TextHandler("No number here"),
TextHandler("Price: $25.99"),
TextHandler("Sale: $19.99")
])
first_price = texts.re_first(r'\$(\d+\.\d+)')
print(first_price) # "25.99"
get
def get(self, default=None) -> TextHandler | None
Return the first element, or default if empty.
Example:
texts = TextHandlers([TextHandler("First"), TextHandler("Second")])
print(texts.get()) # "First"
empty = TextHandlers([])
print(empty.get("default")) # "default"
Scrapy/Parsel Compatibility
texts.extract() # Returns self
texts.extract_first() # Alias for get()
texts.get_all() # Alias for extract()
AttributesHandler
Read-only mapping for HTML element attributes with additional functionality.
Class Definition
from scrapling.core.custom_types import AttributesHandler
class AttributesHandler(Mapping[str, TextHandler]):
"""Read-only mapping with additional attribute handling methods."""
Constructor
def __init__(self, mapping: Any = None, **kwargs: Any) -> None
Example:
attrs = AttributesHandler({
"class": "btn btn-primary",
"id": "submit-button",
"data-action": "submit"
})
# Or with kwargs
attrs = AttributesHandler(class_="btn", id="submit-button")
Methods
get
def get(self, key: str, default: Any = None) -> TextHandler
Get an attribute value by key.
Example:
attrs = AttributesHandler({"class": "container", "id": "main"})
print(attrs.get("class")) # TextHandler("container")
print(attrs.get("missing", "default")) # "default"
print(attrs["id"]) # TextHandler("main")
search_values
def search_values(
self,
keyword: str,
partial: bool = False
) -> Generator[AttributesHandler, None, None]
Search attributes by value.
keyword: Keyword to search for in attribute values.
partial: If True, use substring matching; if False, require an exact match.
Yields: AttributesHandler for each matching key-value pair
Example:
attrs = AttributesHandler({
"class": "btn btn-primary",
"id": "primary-button",
"data-type": "secondary"
})
# Exact match
for match in attrs.search_values("btn btn-primary"):
print(dict(match)) # {'class': 'btn btn-primary'}
# Partial match
for match in attrs.search_values("primary", partial=True):
print(dict(match))
# {'class': 'btn btn-primary'}
# {'id': 'primary-button'}
json_string
@property
def json_string(self) -> bytes
Convert attributes to JSON bytes.
Example:
attrs = AttributesHandler({"class": "container", "id": "main"})
json_bytes = attrs.json_string
print(json_bytes) # b'{"class":"container","id":"main"}'
Usage with Responses
# In spider callbacks
async def parse(self, response):
element = response.css("button").get()
# Access attributes
attrs = element.attrib
# Get specific attribute
button_class = attrs.get("class")
# Search for attributes with specific values
for match in attrs.search_values("submit", partial=True):
print(f"Found: {dict(match)}")
# Convert to dict if needed
attrs_dict = dict(attrs)
Complete Examples
Text Cleaning and Extraction
from scrapling.core.custom_types import TextHandler
html_text = TextHandler("""
Price: &pound;49.99
Contact: sales@example.com
Available: YES
""")
# Clean whitespace and decode HTML entities
cleaned = html_text.clean(remove_entities=True)
print(cleaned) # "Price: £49.99 Contact: sales@example.com Available: YES"
# Extract email
email = cleaned.re_first(r'\b[\w.-]+@[\w.-]+\.\w+\b')
print(email) # "sales@example.com"
# Extract price
price = cleaned.re_first(r'£([\d.]+)')
print(price) # "49.99"
# Check for availability
available = cleaned.re(r'Available:\s*(YES|NO)', check_match=True)
print(available) # True
Working with Lists
from scrapling.core.custom_types import TextHandlers, TextHandler
products = TextHandlers([
TextHandler("Product A - $29.99"),
TextHandler("Product B - $39.99"),
TextHandler("Product C - $49.99")
])
# Extract all prices
prices = products.re(r'\$(\d+\.\d+)')
print(prices) # TextHandlers(['29.99', '39.99', '49.99'])
# Get first product name
first_product = products.re_first(r'Product ([A-Z])')
print(first_product) # "A"
# Get first element
first = products.get()
print(first) # "Product A - $29.99"
Attribute Handling
from scrapling.core.custom_types import AttributesHandler
# Parse element attributes
attrs = AttributesHandler({
"class": "product-card featured",
"data-id": "12345",
"data-price": "29.99",
"data-category": "electronics"
})
# Access attributes
product_id = attrs.get("data-id")
print(product_id) # TextHandler("12345")
# Search by value
for match in attrs.search_values("electronics"):
print(dict(match)) # {'data-category': 'electronics'}
# Search with partial match
for match in attrs.search_values("product", partial=True):
print(dict(match)) # {'class': 'product-card featured'}
# Convert to JSON
import json
attrs_json = attrs.json_string
data = json.loads(attrs_json)
print(data["data-price"]) # "29.99"
Spider Integration
from scrapling.spiders import Spider
class ProductSpider(Spider):
name = "products"
start_urls = ["https://store.example.com"]
async def parse(self, response):
for product in response.css(".product"):
# Text is automatically TextHandler
title = product.css(".title::text").get()
clean_title = title.clean()
# Extract price using regex
price_text = product.css(".price::text").get()
price = price_text.re_first(r'\$(\d+\.\d+)', default="0.00")
# Work with attributes
attrs = product.attrib
product_id = attrs.get("data-id")
yield {
"title": clean_title,
"price": float(price),
"id": product_id
}
Type Hints
For type checking in your code:
from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
from typing import Dict
def process_text(text: TextHandler) -> str:
return text.clean()
def process_list(texts: TextHandlers) -> TextHandler:
return texts.get("default")
def process_attrs(attrs: AttributesHandler) -> Dict[str, str]:
return dict(attrs)
See Also
- Response - Response objects return these types
- Spider - Using custom types in spider callbacks