From 2f180ceed71c966cc869c063620e48e5137506ea Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 15 Jul 2025 16:08:47 +0200 Subject: [PATCH 01/11] Docs: Add architecture overview guide and more Closes: #603 Closes: #766 --- docs/guides/architecture_overview.mdx | 425 ++++++++++++++++++ .../http_crawlers/beautifulsoup_example.py | 35 ++ .../http_crawlers/custom_crawler_example.py | 0 .../http_crawlers/http_example.py | 52 +++ .../http_crawlers/parsel_example.py | 35 ++ ...tandem_example.py => rl_tandem_example.py} | 0 ...licit.py => rl_tandem_example_explicit.py} | 0 .../request_loaders/sitemap_tandem_example.py | 37 ++ .../sitemap_tandem_example_explicit.py | 40 ++ .../adaptive_crawler_handlers.py | 57 +++ .../request_router/advanced_routing.py | 0 .../request_router/basic_request_handlers.py | 92 ++++ .../request_router/builtin_router_approach.py | 0 .../request_router/custom_router_approach.py | 0 .../custom_router_default_only.py | 44 ++ .../request_router/error_handler.py | 60 +++ .../request_router/error_handlers.py | 0 .../request_router/failed_request_handler.py | 64 +++ .../request_router/http_pre_navigation.py | 30 ++ .../playwright_pre_navigation.py | 62 +++ .../request_router/simple_default_handler.py | 34 ++ .../service_locator/service_conflicts.py | 22 + .../service_crawler_configuration.py | 22 + .../service_crawler_event_manager.py | 20 + .../service_crawler_storage_client.py | 17 + .../service_locator_configuration.py | 20 + .../service_locator_event_manager.py | 18 + .../service_locator_storage_client.py | 15 + .../service_storage_configuration.py | 22 + .../service_storage_storage_client.py | 17 + ...=> registering_storage_clients_example.py} | 0 docs/guides/http_clients.mdx | 76 +++- docs/guides/http_crawlers.mdx | 95 +++- docs/guides/playwright_crawler.mdx | 2 +- docs/guides/playwright_crawler_adaptive.mdx | 4 +- docs/guides/request_loaders.mdx | 83 ++-- docs/guides/request_router.mdx | 110 +++++ docs/guides/service_locator.mdx | 136 ++++++ docs/guides/storage_clients.mdx | 113 +++-- docs/guides/storages.mdx | 40 +- website/sidebars.js | 4 + 41 files changed, 1789 insertions(+), 114 deletions(-) create mode 100644 docs/guides/architecture_overview.mdx create mode 100644 docs/guides/code_examples/http_crawlers/beautifulsoup_example.py create mode 100644 docs/guides/code_examples/http_crawlers/custom_crawler_example.py create mode 100644 docs/guides/code_examples/http_crawlers/http_example.py create mode 100644 docs/guides/code_examples/http_crawlers/parsel_example.py rename docs/guides/code_examples/request_loaders/{tandem_example.py => rl_tandem_example.py} (100%) rename docs/guides/code_examples/request_loaders/{tandem_example_explicit.py => rl_tandem_example_explicit.py} (100%) create mode 100644 docs/guides/code_examples/request_loaders/sitemap_tandem_example.py create mode 100644 docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py create mode 100644 docs/guides/code_examples/request_router/adaptive_crawler_handlers.py create mode 100644 docs/guides/code_examples/request_router/advanced_routing.py create mode 100644 docs/guides/code_examples/request_router/basic_request_handlers.py create mode 100644 docs/guides/code_examples/request_router/builtin_router_approach.py create mode 100644 docs/guides/code_examples/request_router/custom_router_approach.py create mode 100644 docs/guides/code_examples/request_router/custom_router_default_only.py create mode 100644 docs/guides/code_examples/request_router/error_handler.py create mode 100644 
docs/guides/code_examples/request_router/error_handlers.py create mode 100644 docs/guides/code_examples/request_router/failed_request_handler.py create mode 100644 docs/guides/code_examples/request_router/http_pre_navigation.py create mode 100644 docs/guides/code_examples/request_router/playwright_pre_navigation.py create mode 100644 docs/guides/code_examples/request_router/simple_default_handler.py create mode 100644 docs/guides/code_examples/service_locator/service_conflicts.py create mode 100644 docs/guides/code_examples/service_locator/service_crawler_configuration.py create mode 100644 docs/guides/code_examples/service_locator/service_crawler_event_manager.py create mode 100644 docs/guides/code_examples/service_locator/service_crawler_storage_client.py create mode 100644 docs/guides/code_examples/service_locator/service_locator_configuration.py create mode 100644 docs/guides/code_examples/service_locator/service_locator_event_manager.py create mode 100644 docs/guides/code_examples/service_locator/service_locator_storage_client.py create mode 100644 docs/guides/code_examples/service_locator/service_storage_configuration.py create mode 100644 docs/guides/code_examples/service_locator/service_storage_storage_client.py rename docs/guides/code_examples/storage_clients/{registering_storage_client_example.py => registering_storage_clients_example.py} (100%) create mode 100644 docs/guides/request_router.mdx create mode 100644 docs/guides/service_locator.mdx diff --git a/docs/guides/architecture_overview.mdx b/docs/guides/architecture_overview.mdx new file mode 100644 index 0000000000..3f08fb1093 --- /dev/null +++ b/docs/guides/architecture_overview.mdx @@ -0,0 +1,425 @@ +--- +id: architecture-overview +title: Architecture overview +description: An overview of the core components of the Crawlee library and its architecture. +--- + +import ApiLink from '@site/src/components/ApiLink'; + +Crawlee is a modern and modular web scraping framework. It is designed for both HTTP and browser-based scraping. In this guide, we will provide a high-level overview of its architecture and the main components that make up the system. + +## Crawler + +The core component of Crawlee is the crawler, which orchestrates the crawling process and takes care of all other components. It manages storages, executes user-defined request handlers, handles retries, manages concurrency, and coordinates all other components. All crawlers inherit from the `BasicCrawler` class, which provides the basic functionality. There are two main groups of specialized crawlers: HTTP crawlers and browser crawlers. + +:::info + +You will learn more about the request handlers in the request router section. 
+ +::: + +```mermaid +--- +config: + class: + hideEmptyMembersBox: true +--- + +classDiagram + +%% ======================== +%% Abstract classes +%% ======================== + +class BasicCrawler { + <> +} + +class AbstractHttpCrawler { + <> +} + +%% ======================== +%% Specific classes +%% ======================== + +class ParselCrawler + +class BeautifulSoupCrawler + +class PlaywrightCrawler + +class AdaptivePlaywrightCrawler + +%% ======================== +%% Inheritance arrows +%% ======================== + +BasicCrawler <|-- AbstractHttpCrawler +BasicCrawler <|-- PlaywrightCrawler +BasicCrawler <|-- AdaptivePlaywrightCrawler +AbstractHttpCrawler <|-- HttpCrawler +AbstractHttpCrawler <|-- ParselCrawler +AbstractHttpCrawler <|-- BeautifulSoupCrawler +``` + +### HTTP crawlers + +HTTP crawlers use HTTP clients to fetch pages and parse them with HTML parsing libraries. They are fast and efficient for sites that do not require JavaScript rendering. HTTP clients are Crawlee components that wrap around HTTP libraries like [httpx](https://www.python-httpx.org/), [curl-impersonate](https://github.com/lwthiker/curl-impersonate) or [impit](https://apify.github.io/impit) and handle HTTP communication for requests and responses. You can learn more about them in the [HTTP clients guide](./http-clients). + +HTTP crawlers inherit from `AbstractHttpCrawler` and there are three crawlers that belong to this category: + +- `BeautifulSoupCrawler` utilizes the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) HTML parser. +- `ParselCrawler` utilizes [Parsel](https://github.com/scrapy/parsel) for parsing HTML. +- `HttpCrawler` does not parse HTTP responses at all and is used when no content parsing is required. + +You can learn more about HTTP crawlers in the [HTTP crawlers guide](./http-crawlers). + +### Browser crawlers + +Browser crawlers use a real browser to render pages, enabling scraping of sites that require JavaScript. They manage browser instances, pages, and context lifecycles. Currently, the only browser crawler is `PlaywrightCrawler`, which utilizes the [Playwright](https://playwright.dev/) library. Playwright provides a high-level API for controlling and navigating browsers. You can learn more about `PlaywrightCrawler`, its features, and how it internally manages browser instances in the [Playwright crawler guide](./playwright-crawler). + +### Adaptive crawler + +The `AdaptivePlaywrightCrawler` sits between HTTP and browser crawlers. It can automatically decide whether to use HTTP or browser crawling for each request based on heuristics or user configuration. This allows for optimal performance and compatibility. You can learn more about adaptive crawling in the [Adaptive Playwright crawler guide](./adaptive-playwright-crawler). + +## Crawling contexts + +Crawling contexts are objects that encapsulate the state and data for each request being processed by the crawler. They provide access to the request, response, session, and helper methods for handling the request. Crawling contexts are used to pass data between different parts of the crawler and to manage the lifecycle of each request. These contexts are provided to user-defined request handlers, which can then use them to access request data, response data, and other information related to the request. 
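As a minimal sketch of how a crawling context is consumed, the example below registers a default request handler on a `ParselCrawler` and uses a few of the context helpers (`log`, `push_data`, `enqueue_links`). It mirrors the Parsel crawler example included elsewhere in this patch; the start URL is only a placeholder.

```python
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    crawler = ParselCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        # The context bundles the request, the parsed response, and helper methods.
        context.log.info(f'Processing {context.request.url}')

        # Extract data with the Parsel selector exposed by the context.
        await context.push_data(
            {
                'url': context.request.url,
                'title': context.selector.css('title::text').get(),
            }
        )

        # Enqueue links discovered on the page for further crawling.
        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```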
+ +```mermaid +--- +config: + class: + hideEmptyMembersBox: true +--- + +classDiagram + +%% ======================== +%% Classes +%% ======================== + +class BasicCrawlingContext { + <> +} + +class HttpCrawlingContext { + <> +} + +class HttpCrawlingResult { + <> +} + +class ParsedHttpCrawlingContext { + <> +} + +class ParselCrawlingContext { + <> +} + +class BeautifulSoupCrawlingContext { + <> +} + +class PlaywrightPreNavCrawlingContext { + <> +} + +class PlaywrightCrawlingContext { + <> +} + +class AdaptivePlaywrightPreNavCrawlingContext { + <> +} + +class AdaptivePlaywrightCrawlingContext { + <> +} + +%% ======================== +%% Inheritance arrows +%% ======================== + +BasicCrawlingContext --|> HttpCrawlingContext + +HttpCrawlingResult --|> HttpCrawlingContext + +HttpCrawlingContext --|> ParsedHttpCrawlingContext + +ParsedHttpCrawlingContext --|> ParselCrawlingContext + +ParsedHttpCrawlingContext --|> BeautifulSoupCrawlingContext + +BasicCrawlingContext --|> PlaywrightPreNavCrawlingContext + +PlaywrightPreNavCrawlingContext --|> PlaywrightCrawlingContext + +BasicCrawlingContext --|> AdaptivePlaywrightPreNavCrawlingContext + +ParsedHttpCrawlingContext --|> AdaptivePlaywrightCrawlingContext +``` + +They have a similar inheritance structure as the crawlers, with the base class being `BasicCrawlingContext`. The specific crawling contexts are: +- `HttpCrawlingContext` for HTTP crawlers. +- `ParsedHttpCrawlingContext` for HTTP crawlers with parsed responses. +- `ParselCrawlingContext` for HTTP crawlers that use [Parsel](https://github.com/scrapy/parsel) for parsing. +- `BeautifulSoupCrawlingContext` for HTTP crawlers that use [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for parsing. +- `PlaywrightPreNavCrawlingContext` for Playwright crawlers before the page is navigated. +- `PlaywrightCrawlingContext` for Playwright crawlers. +- `AdaptivePlaywrightPreNavCrawlingContext` for Adaptive Playwright crawlers before the page is navigated. +- `AdaptivePlaywrightCrawlingContext` for Adaptive Playwright crawlers. + +## Storages + +Storages are the components that manage data in Crawlee. They provide a way to store and retrieve data during the crawling process. Crawlee's storage system consists of two main layers: + +- **Storages**: High-level interfaces for interacting with different storage types +- **Storage clients**: Backend implementations that handle the actual data persistence and management (you will learn more about them in the next section) + +Crawlee provides three built-in storage types for managing data: + +- `Dataset` - Append-only, tabular storage that stores structured results (e.g., scraped data). +- `KeyValueStore` - Stores arbitrary data like JSON documents, images, configs, or state. +- `RequestQueue` - Manages pending and handled requests. + +See the [Storages guide](./storages) for more details. + +```mermaid +--- +config: + class: + hideEmptyMembersBox: true +--- + +classDiagram + +%% ======================== +%% Abstract classes +%% ======================== + +class Storage { + <> +} + +%% ======================== +%% Specific classes +%% ======================== + +class Dataset + +class KeyValueStore + +class RequestQueue + +%% ======================== +%% Inheritance arrows +%% ======================== + +Storage <|-- Dataset +Storage <|-- KeyValueStore +Storage <|-- RequestQueue +``` + +## Storage clients + +Storage clients are the backend implementations for storages that handle interactions with different storage systems. 
They provide a unified interface for `Dataset`, `KeyValueStore`, and `RequestQueue`, regardless of the underlying storage implementation. + +Crawlee provides several built-in storage client implementations: + +- `MemoryStorageClient` - Stores data in memory with no persistence (ideal for testing and fast operations). +- `FileSystemStorageClient` - Provides persistent file system storage with caching (default client). +- [`ApifyStorageClient`](https://docs.apify.com/sdk/python/reference/class/ApifyStorageClient) - Manages storage on the [Apify platform](https://apify.com/) (cloud-based). It is implemented in the [Apify SDK](https://github.com/apify/apify-sdk-python). You can find more information about it in the [Apify SDK documentation](https://docs.apify.com/sdk/python/docs/overview/introduction). + +```mermaid +--- +config: + class: + hideEmptyMembersBox: true +--- + +classDiagram + +%% ======================== +%% Abstract classes +%% ======================== + +class StorageClient { + <> +} + +%% ======================== +%% Specific classes +%% ======================== + +class MemoryStorageClient + +class FileSystemStorageClient + +class ApifyStorageClient + +%% ======================== +%% Inheritance arrows +%% ======================== + +StorageClient <|-- MemoryStorageClient +StorageClient <|-- FileSystemStorageClient +StorageClient <|-- ApifyStorageClient +``` + +Storage clients can be registered globally with the `ServiceLocator` (you will learn more about the `ServiceLocator` in the next section), passed directly to crawlers, or specified when opening individual storage instances. You can also create custom storage clients by implementing the `StorageClient` interface. + +See the [Storage clients guide](./storage-clients) for more details. + +## Request router + +The request `Router` is a central component that manages the flow of requests and responses in Crawlee. It is responsible for routing requests to the appropriate request handlers, managing the crawling context, and coordinating the execution of user-defined logic. + +### Request handlers + +Request handlers are user-defined functions that process requests and responses in Crawlee. They are the core of the crawling logic and are responsible for handling data extraction, processing, and storage. Each request handler receives a crawling context as an argument, which provides access to request data, response data, and other information related to the request. Request handlers can be registered with the `Router`. + +The request routing in Crawlee supports: +- Default handlers - Fallback handlers for requests without specific labels. +- Label-based routing - Handlers for specific request types based on labels. +- Error handlers - Handle errors during request processing. +- Failed request handlers - Handle requests that exceed retry limits. +- Pre-navigation hooks - Execute logic before navigating to URLs. + +See the [Request router guide](./request-router) for detailed information and examples. + +## Service locator + +The `ServiceLocator` is a central registry for global services in Crawlee. It manages and provides access to core services throughout the framework, ensuring consistent configuration across all components. The service locator acts as a dependency injection container that coordinates three essential services: + +- `Configuration` - Application-wide settings and parameters that control various aspects of Crawlee behavior. 
+- `StorageClient` - Backend implementation for data storage across datasets, key-value stores, and request queues. +- `EventManager` - Event coordination system for internal framework events and custom user hooks. + +Services can be registered globally through the `service_locator` singleton instance, passed to crawler constructors, or provided when opening individual storage instances. The service locator includes conflict prevention mechanisms to ensure configuration consistency and prevent accidental service conflicts during runtime. + +See the [Service locator guide](./service-locator) for detailed information about service registration and configuration options. + +## Request loaders + +Request loaders provide a subset of `RequestQueue` functionality, focusing specifically on reading and accessing streams of requests from various sources. They define how requests are fetched and processed, enabling use cases such as reading URLs from files, external APIs, sitemaps, or combining multiple sources together. Unlike request queues, they do not handle storage or persistence—they only provide request reading capabilities. + +- `RequestLoader` - Base interface for read-only access to a stream of requests, with capabilities like fetching the next request, marking as handled, and status checking. +- `RequestList` - Lightweight in-memory implementation of `RequestLoader` for managing static lists of URLs. +- `SitemapRequestLoader` - Specialized loader for reading URLs from XML sitemaps with filtering capabilities. + +### Request managers + +`RequestManager` extends `RequestLoader` with write capabilities for adding and reclaiming requests, providing full request management functionality. `RequestQueue` is the primary concrete implementation of `RequestManager`. + +`RequestManagerTandem` combines a read-only `RequestLoader` with a writable `RequestManager`, transferring requests from the loader to the manager for hybrid scenarios. This is useful when you want to start with a predefined set of URLs (from a file or sitemap) but also need to add new requests dynamically during crawling. The tandem first processes all requests from the loader, then handles any additional requests added to the manager. + +Request loaders are useful when you need to start with a predefined set of URLs. The tandem approach allows processing requests from static sources (like files or sitemaps) while maintaining the ability to add new requests dynamically. + +See the [Request loaders guide](./request-loaders) for detailed information. + +## Event manager + +The `EventManager` is responsible for coordinating internal events throughout Crawlee and enabling custom hooks. It provides a system for registering event listeners, emitting events, and managing their execution lifecycle. + +Crawlee provides several implementations of the event manager: + +- `EventManager` is the base class for event management in Crawlee. +- `LocalEventManager` extends the base event manager for local environments by automatically emitting `SYSTEM_INFO` events at regular intervals. This provides real-time system metrics including CPU usage and memory consumption, which are essential for internal components like the `Snapshotter` and `AutoscaledPool`. +- [`ApifyEventManager`](https://docs.apify.com/sdk/python/reference/class/PlatformEventManager) - Manages events on the [Apify platform](https://apify.com/) (cloud-based). It is implemented in the [Apify SDK](https://docs.apify.com/sdk/python/). 
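The sketch below shows how a listener can be registered on a `LocalEventManager`. The `Event` enum import and the keyword-only `on()` registration method reflect current Crawlee releases but are assumptions here; verify them against the `EventManager` API reference for your installed version.

```python
import asyncio
from datetime import timedelta
from typing import Any

from crawlee.events import Event, LocalEventManager


async def main() -> None:
    event_manager = LocalEventManager(system_info_interval=timedelta(seconds=5))

    # Listeners may be synchronous or asynchronous functions.
    def on_system_info(event_data: Any) -> None:
        print(f'SYSTEM_INFO event received: {event_data}')

    # The event manager is an async context manager: periodic tasks start on
    # enter and all listeners finish before exit.
    async with event_manager:
        # Assumes the `Event` enum and the `on()` method exported in current versions.
        event_manager.on(event=Event.SYSTEM_INFO, listener=on_system_info)
        await asyncio.sleep(6)


if __name__ == '__main__':
    asyncio.run(main())
```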
+ +:::info + +You can learn more about `Snapshotter` and `AutoscaledPool` and their configuration in the [Scaling crawlers guide](./scaling-crawlers). + +::: + +Crawlee defines several built-in event types: + +- `PERSIST_STATE` - Emitted periodically to trigger state persistence. +- `SYSTEM_INFO` - Contains CPU and memory usage information. +- `MIGRATING` - Signals that the crawler is migrating to a different environment. +- `ABORTING` - Indicates the crawler is aborting execution. +- `EXIT` - Emitted when the crawler is exiting. +- `CRAWLER_STATUS` - Provides status updates from crawlers. + +Additional specialized events for browser and session management are also available. + +The event manager operates as an async context manager, automatically starting periodic tasks when entered and ensuring all listeners complete before exiting. Event listeners can be either synchronous or asynchronous functions and are executed safely without blocking the main event loop. + +```mermaid +--- +config: + class: + hideEmptyMembersBox: true +--- + +classDiagram + +%% ======================== +%% Abstract classes +%% ======================== + +class EventManager { + <> +} + +%% ======================== +%% Specific classes +%% ======================== + +class LocalEventManager + +class ApifyEventManager + +%% ======================== +%% Inheritance arrows +%% ======================== + +EventManager <|-- LocalEventManager +EventManager <|-- ApifyEventManager +``` + +## Session management + +The core component of session management in Crawlee is `SessionPool`. It manages a collection of sessions that simulate individual users with unique attributes like cookies, IP addresses (via proxies), and browser fingerprints. Sessions help avoid blocking by rotating user identities and maintaining realistic browsing patterns. + +:::info + +You can learn more about fingerprints and how to avoid getting blocked in the [Avoid blocking guide](./avoid-blocking). + +::: + +### Session + +A session is represented as a `Session` object, which contains components like cookies, error tracking, usage limits, and expiration handling. Sessions can be marked as good (`Session.mark_good`), bad (`Session.mark_bad`), or retired (`Session.retire`) based on their performance, and they automatically become unusable when they exceed error thresholds or usage limits. + +### Session pool + +The session pool provides automated session lifecycle management: + +- Automatic rotation - Retrieves random sessions from the pool and creates new ones as needed. +- Pool maintenance - Removes retired sessions and maintains the pool at maximum capacity. +- State persistence - Persists session state to enable recovery across restarts. +- Configurable limits - Supports custom pool sizes, session settings, and creation functions. + +The pool operates as an async context manager, automatically initializing with sessions and cleaning up on exit. It ensures proper session management by rotating sessions based on usage count, expiration time, and custom rules while maintaining optimal pool size. + +See the [Session management guide](./session-management) for more information. + +## Statistics + +The `Statistics` class provides runtime monitoring for crawler operations, tracking performance metrics like request counts, processing times, retry attempts, and error patterns. It operates as an async context manager, automatically persisting data across crawler restarts and migrations using `KeyValueStore`. 
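A compact way to see these metrics is to inspect the summary returned by `crawler.run` once the crawl finishes; a minimal sketch with a placeholder start URL:

```python
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    crawler = ParselCrawler(max_requests_per_crawl=5)

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

    # run() returns a FinalStatistics summary (request counts, durations, retries).
    stats = await crawler.run(['https://crawlee.dev'])
    print(stats)


if __name__ == '__main__':
    asyncio.run(main())
```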
+ +The system includes error tracking through the `ErrorTracker` class, which groups similar errors by type and message patterns using wildcard matching. It can capture HTML snapshots and screenshots for debugging and separately track retry-specific errors. + +Statistics are logged at configurable intervals in both table and inline formats, with final summary data returned from the `crawler.run` method available through `FinalStatistics`. + +## Conclusion + +In this guide, we provided a high-level overview of the core components of the Crawlee library and its architecture. We covered the main components like crawlers, crawling contexts, storages, request routers, service locator, request loaders, event manager, session management, and statistics. Check out other guides, the [API reference](../api), and [Examples](../examples) for more details on how to use these components in your own projects. + +If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/docs/guides/code_examples/http_crawlers/beautifulsoup_example.py b/docs/guides/code_examples/http_crawlers/beautifulsoup_example.py new file mode 100644 index 0000000000..49e6fde9ec --- /dev/null +++ b/docs/guides/code_examples/http_crawlers/beautifulsoup_example.py @@ -0,0 +1,35 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + # Create a BeautifulSoupCrawler instance + crawler = BeautifulSoupCrawler( + # Limit the crawl to 10 requests + max_requests_per_crawl=10, + ) + + # Define the default request handler + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # Extract data using BeautifulSoup + data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + } + + # Push extracted data to the dataset + await context.push_data(data) + + # Enqueue links found on the page for further crawling + await context.enqueue_links() + + # Run the crawler + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/http_crawlers/custom_crawler_example.py b/docs/guides/code_examples/http_crawlers/custom_crawler_example.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/guides/code_examples/http_crawlers/http_example.py b/docs/guides/code_examples/http_crawlers/http_example.py new file mode 100644 index 0000000000..a426a2ee23 --- /dev/null +++ b/docs/guides/code_examples/http_crawlers/http_example.py @@ -0,0 +1,52 @@ +import asyncio +import re + +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext + + +async def main() -> None: + # Create an HttpCrawler instance - no automatic parsing + crawler = HttpCrawler( + # Limit the crawl to 10 requests + max_requests_per_crawl=10, + ) + + # Define the default request handler + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # Get the raw response content + response_body = await context.http_response.read() + response_text = response_body.decode('utf-8') + + # Extract title manually using regex (since we don't have a parser) + title_match = re.search( + r']*>([^<]+)', response_text, 
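            # Pattern for the text inside the <title> element: r'<title[^>]*>([^<]+)</title>'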
re.IGNORECASE + ) + title = title_match.group(1).strip() if title_match else None + + # Extract basic information + data = { + 'url': context.request.url, + 'title': title, + } + + # Push extracted data to the dataset + await context.push_data(data) + + # Simple link extraction for further crawling + href_pattern = r'href=["\']([^"\']+)["\']' + matches = re.findall(href_pattern, response_text, re.IGNORECASE) + + # Enqueue first few links found (limit to avoid too many requests) + for href in matches[:3]: + if href.startswith('http') and 'crawlee.dev' in href: + await context.add_requests([href]) + + # Run the crawler + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/http_crawlers/parsel_example.py b/docs/guides/code_examples/http_crawlers/parsel_example.py new file mode 100644 index 0000000000..a368317ba6 --- /dev/null +++ b/docs/guides/code_examples/http_crawlers/parsel_example.py @@ -0,0 +1,35 @@ +import asyncio + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext + + +async def main() -> None: + # Create a ParselCrawler instance + crawler = ParselCrawler( + # Limit the crawl to 10 requests + max_requests_per_crawl=10, + ) + + # Define the default request handler + @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # Extract data using Parsel's XPath and CSS selectors + data = { + 'url': context.request.url, + 'title': context.selector.xpath('//title/text()').get(), + } + + # Push extracted data to the dataset + await context.push_data(data) + + # Enqueue links found on the page for further crawling + await context.enqueue_links() + + # Run the crawler + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/request_loaders/tandem_example.py b/docs/guides/code_examples/request_loaders/rl_tandem_example.py similarity index 100% rename from docs/guides/code_examples/request_loaders/tandem_example.py rename to docs/guides/code_examples/request_loaders/rl_tandem_example.py diff --git a/docs/guides/code_examples/request_loaders/tandem_example_explicit.py b/docs/guides/code_examples/request_loaders/rl_tandem_example_explicit.py similarity index 100% rename from docs/guides/code_examples/request_loaders/tandem_example_explicit.py rename to docs/guides/code_examples/request_loaders/rl_tandem_example_explicit.py diff --git a/docs/guides/code_examples/request_loaders/sitemap_tandem_example.py b/docs/guides/code_examples/request_loaders/sitemap_tandem_example.py new file mode 100644 index 0000000000..f43503eaf3 --- /dev/null +++ b/docs/guides/code_examples/request_loaders/sitemap_tandem_example.py @@ -0,0 +1,37 @@ +import asyncio +import re + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext +from crawlee.http_clients import HttpxHttpClient +from crawlee.request_loaders import SitemapRequestLoader + + +async def main() -> None: + # Create an HTTP client for fetching sitemaps + async with HttpxHttpClient() as http_client: + # Create a sitemap request loader with URL filtering + sitemap_loader = SitemapRequestLoader( + sitemap_urls=['https://crawlee.dev/sitemap.xml'], + http_client=http_client, + # Include only URLs that contain 'docs' + include=[re.compile(r'.*docs.*')], + max_buffer_size=500, # Buffer up to 500 URLs in memory + ) + + # Convert the sitemap loader to a request manager using the 
to_tandem method. + # It is a tandem with the default request queue. + request_manager = await sitemap_loader.to_tandem() + + # Create a crawler and pass the request manager to it. + crawler = ParselCrawler(request_manager=request_manager) + + @crawler.router.default_handler + async def handler(context: ParselCrawlingContext) -> None: + # New links will be enqueued directly to the queue. + await context.enqueue_links() + + await crawler.run() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py b/docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py new file mode 100644 index 0000000000..46084f6828 --- /dev/null +++ b/docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py @@ -0,0 +1,40 @@ +import asyncio +import re + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext +from crawlee.http_clients import HttpxHttpClient +from crawlee.request_loaders import RequestManagerTandem, SitemapRequestLoader +from crawlee.storages import RequestQueue + + +async def main() -> None: + # Create an HTTP client for fetching sitemaps + async with HttpxHttpClient() as http_client: + # Create a sitemap request loader with URL filtering + sitemap_loader = SitemapRequestLoader( + sitemap_urls=['https://crawlee.dev/sitemap.xml'], + http_client=http_client, + # Include only URLs that contain 'docs' + include=[re.compile(r'.*docs.*')], + max_buffer_size=500, # Buffer up to 500 URLs in memory + ) + + # Open the default request queue. + request_queue = await RequestQueue.open() + + # And combine them together to a single request manager. + request_manager = RequestManagerTandem(sitemap_loader, request_queue) + + # Create a crawler and pass the request manager to it. + crawler = ParselCrawler(request_manager=request_manager) + + @crawler.router.default_handler + async def handler(context: ParselCrawlingContext) -> None: + # New links will be enqueued directly to the queue. 
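            # The tandem serves all requests from the sitemap loader first, then continues with these newly enqueued requests.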
+ await context.enqueue_links() + + await crawler.run() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/request_router/adaptive_crawler_handlers.py b/docs/guides/code_examples/request_router/adaptive_crawler_handlers.py new file mode 100644 index 0000000000..fbe0d412bd --- /dev/null +++ b/docs/guides/code_examples/request_router/adaptive_crawler_handlers.py @@ -0,0 +1,57 @@ +import asyncio + +from crawlee.crawlers import ( + AdaptivePlaywrightCrawler, + AdaptivePlaywrightCrawlingContext, + AdaptivePlaywrightPreNavCrawlingContext, +) +from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import ( + AdaptiveContextError, +) + + +async def main() -> None: + crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( + max_requests_per_crawl=5, + ) + + # Common pre-navigation hook (runs for all requests) + @crawler.pre_navigation_hook + async def common_setup(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: + # This runs for both HTTP and browser requests + context.request.headers['Accept'] = 'text/html,application/xhtml+xml' + + # Playwright-specific pre-navigation hook (only when using browser) + @crawler.pre_navigation_hook(playwright_only=True) + async def browser_setup(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: + # This only runs when browser is used + await context.page.set_viewport_size({'width': 1280, 'height': 720}) + if context.block_requests: + await context.block_requests(extra_url_patterns=['*.css', '*.js']) + + @crawler.router.default_handler + async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + try: + # Try browser-based extraction first + page = context.page + title = await page.title() + method = 'browser' + except AdaptiveContextError: + # Fallback to static parsing + title_tag = context.parsed_content.find('title') + title = title_tag.get_text() if title_tag else 'No title' + method = 'static' + + await context.push_data( + { + 'url': context.request.url, + 'title': title, + 'method': method, + } + ) + + await crawler.run(['https://warehouse-theme-metal.myshopify.com/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/request_router/advanced_routing.py b/docs/guides/code_examples/request_router/advanced_routing.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/guides/code_examples/request_router/basic_request_handlers.py b/docs/guides/code_examples/request_router/basic_request_handlers.py new file mode 100644 index 0000000000..4d67550fa4 --- /dev/null +++ b/docs/guides/code_examples/request_router/basic_request_handlers.py @@ -0,0 +1,92 @@ +import asyncio + +from crawlee import Request +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext +from crawlee.router import Router + + +async def main() -> None: + # Create a custom router instance + router = Router[ParselCrawlingContext]() + + # Define the default handler (fallback for requests without specific labels) + @router.default_handler + async def default_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing homepage: {context.request.url}') + + # Extract page title + title = context.selector.css('title::text').get() or 'No title found' + + await context.push_data( + { + 'url': context.request.url, + 'title': title, + 'page_type': 'homepage', + } + ) + + # Find and enqueue collection/category links + await context.enqueue_links(selector='a[href*="/collections/"]', label='CATEGORY') + + 
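    # Requests enqueued above with label='CATEGORY' are routed to the handler defined next.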
# Define a handler for category pages + @router.handler('CATEGORY') + async def category_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing category page: {context.request.url}') + + # Extract category information + category_title = context.selector.css('h1::text').get() or 'Unknown Category' + product_count = len(context.selector.css('.product-item').getall()) + + await context.push_data( + { + 'url': context.request.url, + 'type': 'category', + 'category_title': category_title, + 'product_count': product_count, + 'handler': 'category', + } + ) + + # Enqueue product links from this category + await context.enqueue_links(selector='a[href*="/products/"]', label='PRODUCT') + + # Define a handler for product detail pages + @router.handler('PRODUCT') + async def product_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing product page: {context.request.url}') + + # Extract detailed product information + product_data = { + 'url': context.request.url, + 'name': context.selector.css('h1::text').get(), + 'price': context.selector.css('.price::text').get(), + 'description': context.selector.css('.product-description p::text').get(), + 'images': context.selector.css('.product-gallery img::attr(src)').getall(), + 'in_stock': bool(context.selector.css('.add-to-cart-button').get()), + 'handler': 'product', + } + + await context.push_data(product_data) + + # Create crawler with the router + crawler = ParselCrawler( + request_handler=router, + max_requests_per_crawl=20, + ) + + # Start crawling with some initial requests + await crawler.run( + [ + # Will use default handler + 'https://warehouse-theme-metal.myshopify.com/', + # Will use category handler + Request.from_url( + 'https://warehouse-theme-metal.myshopify.com/collections/all', + label='CATEGORY', + ), + ] + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/request_router/builtin_router_approach.py b/docs/guides/code_examples/request_router/builtin_router_approach.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/guides/code_examples/request_router/custom_router_approach.py b/docs/guides/code_examples/request_router/custom_router_approach.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/guides/code_examples/request_router/custom_router_default_only.py b/docs/guides/code_examples/request_router/custom_router_default_only.py new file mode 100644 index 0000000000..5ace98cf1b --- /dev/null +++ b/docs/guides/code_examples/request_router/custom_router_default_only.py @@ -0,0 +1,44 @@ +import asyncio + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext +from crawlee.router import Router + + +async def main() -> None: + # Create a custom router instance + router = Router[ParselCrawlingContext]() + + # Define only a default handler + @router.default_handler + async def default_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # Extract page title + title = context.selector.css('title::text').get() or 'No title found' + + # Extract and save basic page data + await context.push_data( + { + 'url': context.request.url, + 'title': title, + } + ) + + # Find and enqueue product links for further crawling + await context.enqueue_links( + selector='a[href*="/products/"]', + label='PRODUCT', # Note: no handler for this label, will use default + ) + + # Create crawler with the custom router + crawler = ParselCrawler( + request_handler=router, + 
max_requests_per_crawl=10, + ) + + # Start crawling + await crawler.run(['https://warehouse-theme-metal.myshopify.com/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/request_router/error_handler.py b/docs/guides/code_examples/request_router/error_handler.py new file mode 100644 index 0000000000..40b7e99fa5 --- /dev/null +++ b/docs/guides/code_examples/request_router/error_handler.py @@ -0,0 +1,60 @@ +import asyncio + +from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext +from crawlee.errors import HttpStatusCodeError + +# HTTP status code constants +TOO_MANY_REQUESTS = 429 + + +async def main() -> None: + # Create a crawler instance + crawler = ParselCrawler(max_requests_per_crawl=10) + + @crawler.router.default_handler + async def default_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # Extract product information (might fail for some pages) + product_name = context.selector.css('h1[data-testid="product-title"]::text').get() + if not product_name: + raise ValueError('Product name not found - might be a non-product page') + + price = context.selector.css('.price::text').get() + await context.push_data( + { + 'url': context.request.url, + 'product_name': product_name, + 'price': price, + } + ) + + # Error handler - called when an error occurs during request processing + @crawler.error_handler + async def error_handler(context: BasicCrawlingContext, error: Exception) -> None: + error_name = type(error).__name__ + context.log.warning(f'Error occurred for {context.request.url}: {error_name}') + + # You can modify the request or context here before retry + if ( + isinstance(error, HttpStatusCodeError) + and error.status_code == TOO_MANY_REQUESTS + ): + context.log.info('Rate limited - will retry with delay') + # You could modify headers, add delay, etc. 
+ elif isinstance(error, ValueError): + context.log.info('Parse error - marking request as no retry') + context.request.no_retry = True + + # Start crawling + await crawler.run( + [ + 'https://warehouse-theme-metal.myshopify.com/products/on-running-cloudmonster-2-mens', + # Might cause parse error + 'https://warehouse-theme-metal.myshopify.com/collections/mens-running', + ] + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/request_router/error_handlers.py b/docs/guides/code_examples/request_router/error_handlers.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/guides/code_examples/request_router/failed_request_handler.py b/docs/guides/code_examples/request_router/failed_request_handler.py new file mode 100644 index 0000000000..0f1634c1b1 --- /dev/null +++ b/docs/guides/code_examples/request_router/failed_request_handler.py @@ -0,0 +1,64 @@ +import asyncio + +from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext + + +async def main() -> None: + # Create a crawler instance with retry settings + crawler = ParselCrawler( + max_requests_per_crawl=10, + max_request_retries=2, # Allow 2 retries before failing + ) + + @crawler.router.default_handler + async def default_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # Extract product information + product_name = context.selector.css('h1[data-testid="product-title"]::text').get() + if not product_name: + product_name = context.selector.css('h1::text').get() or 'Unknown Product' + + price = context.selector.css('.price::text').get() or 'Price not available' + + await context.push_data( + { + 'url': context.request.url, + 'product_name': product_name, + 'price': price, + 'status': 'success', + } + ) + + # Failed request handler - called when request has exhausted all retries + @crawler.failed_request_handler + async def failed_handler(context: BasicCrawlingContext, error: Exception) -> None: + context.log.error( + f'Failed to process {context.request.url} after all retries: {error}' + ) + + # Save failed request information for analysis + await context.push_data( + { + 'failed_url': context.request.url, + 'label': context.request.label, + 'error_type': type(error).__name__, + 'error_message': str(error), + 'retry_count': context.request.retry_count, + 'status': 'failed', + } + ) + + # Start crawling with some URLs that might fail + await crawler.run( + [ + 'https://warehouse-theme-metal.myshopify.com/products/on-running-cloudmonster-2-mens', + # This will likely fail + 'https://warehouse-theme-metal.myshopify.com/invalid-url', + 'https://warehouse-theme-metal.myshopify.com/products/valid-product', + ] + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/request_router/http_pre_navigation.py b/docs/guides/code_examples/request_router/http_pre_navigation.py new file mode 100644 index 0000000000..05db6eff15 --- /dev/null +++ b/docs/guides/code_examples/request_router/http_pre_navigation.py @@ -0,0 +1,30 @@ +import asyncio + +from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext + + +async def main() -> None: + crawler = ParselCrawler() + + @crawler.pre_navigation_hook + async def setup_request(context: BasicCrawlingContext) -> None: + # Add custom headers before making the request + context.request.headers['User-Agent'] = 'Crawlee Bot 1.0' + context.request.headers['Accept'] = 'text/html,application/xhtml+xml' + + 
@crawler.router.default_handler + async def default_handler(context: ParselCrawlingContext) -> None: + # Extract basic page information + title = context.selector.css('title::text').get() + await context.push_data( + { + 'url': context.request.url, + 'title': title, + } + ) + + await crawler.run(['https://warehouse-theme-metal.myshopify.com/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/request_router/playwright_pre_navigation.py b/docs/guides/code_examples/request_router/playwright_pre_navigation.py new file mode 100644 index 0000000000..7940cc072c --- /dev/null +++ b/docs/guides/code_examples/request_router/playwright_pre_navigation.py @@ -0,0 +1,62 @@ +import asyncio + +from crawlee.crawlers import ( + PlaywrightCrawler, + PlaywrightCrawlingContext, + PlaywrightPreNavCrawlingContext, +) + + +async def main() -> None: + crawler = PlaywrightCrawler(max_requests_per_crawl=5) + + @crawler.pre_navigation_hook + async def setup_page(context: PlaywrightPreNavCrawlingContext) -> None: + # Set viewport size for consistent rendering + await context.page.set_viewport_size({'width': 1280, 'height': 720}) + + # Block unnecessary resources to speed up crawling + await context.block_requests( + extra_url_patterns=[ + '*.png', + '*.jpg', + '*.jpeg', + '*.gif', + '*.svg', + '*.css', + '*.woff', + '*.woff2', + '*.ttf', + '*google-analytics*', + '*facebook*', + '*twitter*', + ] + ) + + # Set custom user agent + await context.page.set_extra_http_headers( + { + 'User-Agent': 'Mozilla/5.0 (compatible; Crawlee Bot)', + } + ) + + @crawler.router.default_handler + async def default_handler(context: PlaywrightCrawlingContext) -> None: + # Wait for page to load + await context.page.wait_for_load_state('networkidle') + + # Extract page title + title = await context.page.title() + + await context.push_data( + { + 'url': context.request.url, + 'title': title, + } + ) + + await crawler.run(['https://warehouse-theme-metal.myshopify.com/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/request_router/simple_default_handler.py b/docs/guides/code_examples/request_router/simple_default_handler.py new file mode 100644 index 0000000000..e055491844 --- /dev/null +++ b/docs/guides/code_examples/request_router/simple_default_handler.py @@ -0,0 +1,34 @@ +import asyncio + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext + + +async def main() -> None: + # Create a crawler instance + crawler = ParselCrawler(max_requests_per_crawl=10) + + # Use the crawler's built-in router to define a default handler + @crawler.router.default_handler + async def default_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url}') + + # Extract page title + title = context.selector.css('title::text').get() or 'No title found' + + # Extract and save basic page data + await context.push_data( + { + 'url': context.request.url, + 'title': title, + } + ) + + # Find and enqueue product links for further crawling + await context.enqueue_links(selector='a[href*="/products/"]', label='PRODUCT') + + # Start crawling + await crawler.run(['https://warehouse-theme-metal.myshopify.com/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/service_locator/service_conflicts.py b/docs/guides/code_examples/service_locator/service_conflicts.py new file mode 100644 index 0000000000..52bcbbe8e9 --- /dev/null +++ 
b/docs/guides/code_examples/service_locator/service_conflicts.py @@ -0,0 +1,22 @@ +import asyncio + +from crawlee import service_locator +from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient + + +async def main() -> None: + # Register the storage client via service locator. + memory_storage_client = MemoryStorageClient() + service_locator.set_storage_client(memory_storage_client) + + # Retrieve the storage client. + current_storage_client = service_locator.get_storage_client() + + # Try to set a different storage client, which will raise ServiceConflictError + # if storage client was already retrieved. + file_system_storage_client = FileSystemStorageClient() + service_locator.set_storage_client(file_system_storage_client) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/service_locator/service_crawler_configuration.py b/docs/guides/code_examples/service_locator/service_crawler_configuration.py new file mode 100644 index 0000000000..50b13fee71 --- /dev/null +++ b/docs/guides/code_examples/service_locator/service_crawler_configuration.py @@ -0,0 +1,22 @@ +import asyncio +from datetime import timedelta + +from crawlee.configuration import Configuration +from crawlee.crawlers import ParselCrawler + + +async def main() -> None: + configuration = Configuration( + log_level='DEBUG', + headless=False, + persist_state_interval=timedelta(seconds=30), + ) + + # Register configuration via crawler. + crawler = ParselCrawler( + configuration=configuration, + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/service_locator/service_crawler_event_manager.py b/docs/guides/code_examples/service_locator/service_crawler_event_manager.py new file mode 100644 index 0000000000..e8a82f4f0e --- /dev/null +++ b/docs/guides/code_examples/service_locator/service_crawler_event_manager.py @@ -0,0 +1,20 @@ +import asyncio +from datetime import timedelta + +from crawlee.crawlers import ParselCrawler +from crawlee.events import LocalEventManager + + +async def main() -> None: + event_manager = LocalEventManager( + system_info_interval=timedelta(seconds=5), + ) + + # Register event manager via crawler. + crawler = ParselCrawler( + event_manager=event_manager, + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/service_locator/service_crawler_storage_client.py b/docs/guides/code_examples/service_locator/service_crawler_storage_client.py new file mode 100644 index 0000000000..76fe923877 --- /dev/null +++ b/docs/guides/code_examples/service_locator/service_crawler_storage_client.py @@ -0,0 +1,17 @@ +import asyncio + +from crawlee.crawlers import ParselCrawler +from crawlee.storage_clients import MemoryStorageClient + + +async def main() -> None: + storage_client = MemoryStorageClient() + + # Register storage client via crawler. 
+ crawler = ParselCrawler( + storage_client=storage_client, + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/service_locator/service_locator_configuration.py b/docs/guides/code_examples/service_locator/service_locator_configuration.py new file mode 100644 index 0000000000..bb3f429eed --- /dev/null +++ b/docs/guides/code_examples/service_locator/service_locator_configuration.py @@ -0,0 +1,20 @@ +import asyncio +from datetime import timedelta + +from crawlee import service_locator +from crawlee.configuration import Configuration + + +async def main() -> None: + configuration = Configuration( + log_level='DEBUG', + headless=False, + persist_state_interval=timedelta(seconds=30), + ) + + # Register configuration via service locator. + service_locator.set_configuration(configuration) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/service_locator/service_locator_event_manager.py b/docs/guides/code_examples/service_locator/service_locator_event_manager.py new file mode 100644 index 0000000000..3d98a8cf55 --- /dev/null +++ b/docs/guides/code_examples/service_locator/service_locator_event_manager.py @@ -0,0 +1,18 @@ +import asyncio +from datetime import timedelta + +from crawlee import service_locator +from crawlee.events import LocalEventManager + + +async def main() -> None: + event_manager = LocalEventManager( + system_info_interval=timedelta(seconds=5), + ) + + # Register event manager via service locator. + service_locator.set_event_manager(event_manager) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/service_locator/service_locator_storage_client.py b/docs/guides/code_examples/service_locator/service_locator_storage_client.py new file mode 100644 index 0000000000..4dcad08420 --- /dev/null +++ b/docs/guides/code_examples/service_locator/service_locator_storage_client.py @@ -0,0 +1,15 @@ +import asyncio + +from crawlee import service_locator +from crawlee.storage_clients import MemoryStorageClient + + +async def main() -> None: + storage_client = MemoryStorageClient() + + # Register storage client via service locator. + service_locator.set_storage_client(storage_client) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/service_locator/service_storage_configuration.py b/docs/guides/code_examples/service_locator/service_storage_configuration.py new file mode 100644 index 0000000000..4c7370b77b --- /dev/null +++ b/docs/guides/code_examples/service_locator/service_storage_configuration.py @@ -0,0 +1,22 @@ +import asyncio +from datetime import timedelta + +from crawlee.configuration import Configuration +from crawlee.storages import Dataset + + +async def main() -> None: + configuration = Configuration( + log_level='DEBUG', + headless=False, + persist_state_interval=timedelta(seconds=30), + ) + + # Pass the configuration to the dataset (or other storage) when opening it. 
+ dataset = await Dataset.open( + configuration=configuration, + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/service_locator/service_storage_storage_client.py b/docs/guides/code_examples/service_locator/service_storage_storage_client.py new file mode 100644 index 0000000000..02a0853d44 --- /dev/null +++ b/docs/guides/code_examples/service_locator/service_storage_storage_client.py @@ -0,0 +1,17 @@ +import asyncio + +from crawlee.storage_clients import MemoryStorageClient +from crawlee.storages import Dataset + + +async def main() -> None: + storage_client = MemoryStorageClient() + + # Pass the storage client to the dataset (or other storage) when opening it. + dataset = await Dataset.open( + storage_client=storage_client, + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/storage_clients/registering_storage_client_example.py b/docs/guides/code_examples/storage_clients/registering_storage_clients_example.py similarity index 100% rename from docs/guides/code_examples/storage_clients/registering_storage_client_example.py rename to docs/guides/code_examples/storage_clients/registering_storage_clients_example.py diff --git a/docs/guides/http_clients.mdx b/docs/guides/http_clients.mdx index 2d79dabf8d..eaa18f9ae4 100644 --- a/docs/guides/http_clients.mdx +++ b/docs/guides/http_clients.mdx @@ -1,7 +1,7 @@ --- id: http-clients title: HTTP clients -description: Crawlee supports multiple HTTP clients when making requests. +description: Learn about Crawlee's HTTP client architecture, how to switch between different implementations, and create custom HTTP clients for specialized web scraping needs. --- import ApiLink from '@site/src/components/ApiLink'; @@ -12,11 +12,49 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import BsCurlImpersonateExample from '!!raw-loader!roa-loader!./code_examples/http_clients/curl_impersonate_example.py'; import BsHttpxExample from '!!raw-loader!roa-loader!./code_examples/http_clients/httpx_example.py'; -HTTP clients are utilized by the HTTP-based crawlers (e.g. `BeautifulSoupCrawler`) to communicate with web servers. They use external HTTP libraries for communication, rather than a browser. Examples of such libraries include [httpx](https://pypi.org/project/httpx/), [aiohttp](https://pypi.org/project/aiohttp/) or [curl-cffi](https://pypi.org/project/curl-cffi/). After retrieving page content, an HTML parsing library is typically used to facilitate data extraction. Examples of such libraries are [beautifulsoup](https://pypi.org/project/beautifulsoup4/), [parsel](https://pypi.org/project/parsel/), [selectolax](https://pypi.org/project/selectolax/) or [pyquery](https://pypi.org/project/pyquery/). These crawlers are faster than browser-based crawlers but they cannot execute client-side JavaScript. +HTTP clients are utilized by HTTP-based crawlers (e.g., `ParselCrawler` and `BeautifulSoupCrawler`) to communicate with web servers. They use external HTTP libraries for communication rather than a browser. Examples of such libraries include [httpx](https://pypi.org/project/httpx/), [aiohttp](https://pypi.org/project/aiohttp/), [curl-cffi](https://pypi.org/project/curl-cffi/), and [impit](https://apify.github.io/impit/). After retrieving page content, an HTML parsing library is typically used to facilitate data extraction. 
Examples of such libraries include [beautifulsoup](https://pypi.org/project/beautifulsoup4/), [parsel](https://pypi.org/project/parsel/), [selectolax](https://pypi.org/project/selectolax/), and [pyquery](https://pypi.org/project/pyquery/). These crawlers are faster than browser-based crawlers but cannot execute client-side JavaScript. -## How to switch between HTTP clients +```mermaid +--- +config: + class: + hideEmptyMembersBox: true +--- + +classDiagram + +%% ======================== +%% Abstract classes +%% ======================== + +class HttpClient { + <> +} + +%% ======================== +%% Specific classes +%% ======================== -In Crawlee we currently have two HTTP clients: `HttpxHttpClient`, which uses the `httpx` library, and `CurlImpersonateHttpClient`, which uses the `curl-cffi` library. You can switch between them by setting the `http_client` parameter in the Crawler class. The default HTTP client is `HttpxHttpClient`. Below are examples of how to set the HTTP client for the `BeautifulSoupCrawler`. +class HttpxHttpClient + +class CurlImpersonateHttpClient + +class ImpitHttpClient + +%% ======================== +%% Inheritance arrows +%% ======================== + +HttpClient <|-- HttpxHttpClient +HttpClient <|-- CurlImpersonateHttpClient +HttpClient <|-- ImpitHttpClient +``` + +## Switching between HTTP clients + +Crawlee currently provides two main HTTP clients: `HttpxHttpClient`, which uses the `httpx` library, and `CurlImpersonateHttpClient`, which uses the `curl-cffi` library. You can switch between them by setting the `http_client` parameter when initializing a crawler class. The default HTTP client is `HttpxHttpClient`. + +Below are examples of how to configure the HTTP client for the `BeautifulSoupCrawler`: @@ -24,27 +62,45 @@ In Crawlee we currently have two HTTP clients: + {BsCurlImpersonateExample} -### Installation +## Installation requirements -Since `HttpxHttpClient` is the default HTTP client, you don't need to install additional packages to use it. If you want to use `CurlImpersonateHttpClient`, you need to install `crawlee` with the `curl-impersonate` extra. +Since `HttpxHttpClient` is the default HTTP client, it's included with the base Crawlee installation and requires no additional packages. + +For `CurlImpersonateHttpClient`, you need to install Crawlee with the `curl-impersonate` extra: ```sh python -m pip install 'crawlee[curl-impersonate]' ``` -or install all available extras: +Alternatively, you can install all available extras to get access to all HTTP clients and features: ```sh python -m pip install 'crawlee[all]' ``` -## How HTTP clients work +## Creating custom HTTP clients + +Crawlee provides an abstract base class, `HttpClient`, which defines the interface that all HTTP clients must implement. This allows you to create custom HTTP clients tailored to your specific requirements. + +HTTP clients are responsible for several key operations: + +- sending HTTP requests and receiving responses, +- managing cookies and sessions, +- handling headers and authentication, +- managing proxy configurations, +- connection pooling with timeout management. + +To create a custom HTTP client, you need to inherit from the `HttpClient` base class and implement all required abstract methods. Your implementation must be async-compatible and include proper cleanup and resource management to work seamlessly with Crawlee's concurrent processing model. 
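To make the shape of such an implementation concrete, below is a minimal structural sketch. It assumes that `HttpClient` is importable from `crawlee.http_clients` and that its abstract interface includes a `crawl` method used by the crawlers and a `send_request` helper, mirroring what the built-in clients expose; treat the method names and signatures as assumptions and consult the `HttpClient` API reference for the exact abstract methods you must implement.

```python
from crawlee.http_clients import HttpClient


class MyHttpClient(HttpClient):
    """A structural sketch of a custom HTTP client backed by a library of your choice."""

    async def crawl(self, request, *, session=None, proxy_info=None, statistics=None):
        # Perform the request with your HTTP library, honoring the session and
        # proxy information provided by the crawler, and wrap the result in the
        # response type the crawlers expect.
        raise NotImplementedError

    async def send_request(self, url, *, method='GET', headers=None, payload=None, session=None, proxy_info=None):
        # Lightweight one-off request helper used outside the main crawl loop.
        raise NotImplementedError
```

Once implemented, such a client plugs into any HTTP-based crawler through the same `http_client` parameter shown in the examples above.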
+ +## Conclusion + +This guide introduced you to the HTTP clients available in Crawlee and demonstrated how to switch between them, including their installation requirements and usage examples. You also learned about the responsibilities of HTTP clients and how to implement your own custom HTTP client by inheriting from the `HttpClient` base class. -We provide an abstract base class, `HttpClient`, which defines the necessary interface for all HTTP clients. HTTP clients are responsible for sending requests and receiving responses, as well as managing cookies, headers, and proxies. They provide methods that are called from crawlers. To implement your own HTTP client, inherit from the `HttpClient` class and implement the required methods. +If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/docs/guides/http_crawlers.mdx b/docs/guides/http_crawlers.mdx index 42541fe456..a533f8b0cb 100644 --- a/docs/guides/http_crawlers.mdx +++ b/docs/guides/http_crawlers.mdx @@ -1,38 +1,101 @@ --- id: http-crawlers title: HTTP crawlers -description: Crawlee supports multiple HTTP crawlers that can be used to extract data from server-rendered webpages. +description: Learn about Crawlee's HTTP crawlers including BeautifulSoup, Parsel, and raw HTTP crawlers for efficient server-rendered content extraction without JavaScript execution. --- import ApiLink from '@site/src/components/ApiLink'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; -Generic class `AbstractHttpCrawler` is parent to `BeautifulSoupCrawler`, `ParselCrawler` and `HttpCrawler` and it could be used as parent for your crawler with custom content parsing requirements. +import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/http_crawlers/beautifulsoup_example.py'; +import ParselExample from '!!raw-loader!roa-loader!./code_examples/http_crawlers/parsel_example.py'; +import HttpExample from '!!raw-loader!roa-loader!./code_examples/http_crawlers/http_example.py'; -It already includes almost all the functionality to crawl webpages and the only missing part is the parser that should be used to parse HTTP responses, and a context dataclass that defines what context helpers will be available to user handler functions. +HTTP crawlers are ideal for extracting data from server-rendered websites that don't require JavaScript execution. These crawlers make requests via HTTP clients to fetch HTML content and then parse it using various parsing libraries. For client-side rendered content, where you need to execute JavaScript, consider using the [Playwright crawler](https://crawlee.dev/python/docs/guides/playwright-crawler) instead. -## `BeautifulSoupCrawler` +## Overview -`BeautifulSoupCrawler` uses `BeautifulSoupParser` to parse the HTTP response and makes it available in `BeautifulSoupCrawlingContext` in the `.soup` or `.parsed_content` attribute. +All HTTP crawlers share a common architecture built around the `AbstractHttpCrawler` base class. The main differences lie in the parsing strategy and the context provided to request handlers. The available implementations are `BeautifulSoupCrawler`, `ParselCrawler`, and `HttpCrawler`, and `AbstractHttpCrawler` can also be extended to create custom crawlers with specialized parsing requirements.
They use HTTP clients to fetch page content and parsing libraries to extract data from the HTML. Check out the [HTTP clients guide](./http-clients) to learn about the HTTP clients used by these crawlers, how to switch between them, and how to create custom HTTP clients tailored to your specific requirements. -## `ParselCrawler` +```mermaid +--- +config: + class: + hideEmptyMembersBox: true +--- + +classDiagram + +%% ======================== +%% Abstract classes +%% ======================== + +class BasicCrawler { + <> +} + +class AbstractHttpCrawler { + <> +} + +%% ======================== +%% Specific classes +%% ======================== + +class HttpCrawler + +class ParselCrawler + +class BeautifulSoupCrawler + +%% ======================== +%% Inheritance arrows +%% ======================== + +BasicCrawler <|-- AbstractHttpCrawler +AbstractHttpCrawler <|-- HttpCrawler +AbstractHttpCrawler <|-- ParselCrawler +AbstractHttpCrawler <|-- BeautifulSoupCrawler +``` + +## BeautifulSoupCrawler + +The `BeautifulSoupCrawler` uses the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) library for HTML parsing. It provides fault-tolerant parsing that handles malformed HTML, automatic character encoding detection, and supports CSS selectors, tag navigation, and custom search functions. Use this crawler when working with imperfect HTML structures, when you prefer BeautifulSoup's intuitive API, or when prototyping web scraping solutions. + + + {BeautifulSoupExample} + + +## ParselCrawler + +The `ParselCrawler` uses the [Parsel](https://parsel.readthedocs.io/) library, which provides XPath 1.0 and CSS selector support built on `lxml` for high performance. It includes built-in regex support for pattern matching and proper XML namespace handling, and offers better performance than BeautifulSoup while maintaining a clean API. Use this crawler when you need XPath functionality, require high-performance parsing, or need to extract data using regular expressions. + + + {ParselExample} + + +## HttpCrawler -`ParselCrawler` uses `ParselParser` to parse the HTTP response and makes it available in `ParselCrawlingContext` in the `.selector` or `.parsed_content` attribute. +The `HttpCrawler` provides direct access to HTTP response body and headers without automatic parsing, offering maximum performance with no parsing overhead. It supports any content type (JSON, XML, binary) and allows complete control over response processing, including memory-efficient handling of large responses. Use this crawler when working with non-HTML content, requiring maximum performance, implementing custom parsing logic, or needing access to raw response data. -## `HttpCrawler` + + {HttpExample} + -`HttpCrawler` uses `NoParser` that does not parse the HTTP response at all and is to be used if no parsing is required. +## Creating a custom HTTP crawler -## Creating your own HTTP crawler +While the built-in crawlers cover most use cases, you might need a custom HTTP crawler for specialized parsing requirements. To create a custom HTTP crawler, inherit directly from `AbstractHttpCrawler`. This approach requires implementing: -### Why? +1. **Custom parser class**: Inherit from `AbstractHttpParser`. +2. **Custom context class**: Define what data and helpers are available to handlers. +3. **Custom crawler class**: Tie everything together. -In case you want to use some custom parser for parsing HTTP responses, and the rest of the `AbstractHttpCrawler` functionality suit your needs.
+This approach is recommended when you need tight integration between parsing and the crawling context, or when you're building a reusable crawler for a specific technology or format. -### How? +## Conclusion -You need to define at least 2 new classes and decide what will be the type returned by the parser's `parse` method. -Parser will inherit from `AbstractHttpParser` and it will need to implement all it's abstract methods. Crawler will inherit from `AbstractHttpCrawler` and it will need to implement all it's abstract methods. Newly defined parser is then used in the `parser` argument of `AbstractHttpCrawler.__init__` method. +This guide provided a comprehensive overview of HTTP crawlers in Crawlee. You learned about the three main crawler types - `BeautifulSoupCrawler` for fault-tolerant HTML parsing, `ParselCrawler` for high-performance extraction with XPath and CSS selectors, and `HttpCrawler` for raw response processing. You also discovered how to create custom crawlers for specific use cases. -To get better idea and as an example please see one of our own HTTP-based crawlers mentioned above. +If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/docs/guides/playwright_crawler.mdx b/docs/guides/playwright_crawler.mdx index 124f09e8ad..6232c16ddb 100644 --- a/docs/guides/playwright_crawler.mdx +++ b/docs/guides/playwright_crawler.mdx @@ -1,7 +1,7 @@ --- id: playwright-crawler title: Playwright crawler -description: How to use the PlaywrightCrawler and its related components. +description: Learn how to use PlaywrightCrawler for browser-based web scraping. --- import ApiLink from '@site/src/components/ApiLink'; diff --git a/docs/guides/playwright_crawler_adaptive.mdx b/docs/guides/playwright_crawler_adaptive.mdx index 696bc15163..7957b98015 100644 --- a/docs/guides/playwright_crawler_adaptive.mdx +++ b/docs/guides/playwright_crawler_adaptive.mdx @@ -1,7 +1,7 @@ --- id: adaptive-playwright-crawler -title: AdaptivePlaywrightCrawler -description: How to use the AdaptivePlaywrightCrawler. +title: Adaptive Playwright crawler +description: Learn how to use the Adaptive Playwright crawler to automatically switch between browser-based and HTTP-only crawling. 
--- import ApiLink from '@site/src/components/ApiLink'; diff --git a/docs/guides/request_loaders.mdx b/docs/guides/request_loaders.mdx index ccc622bca8..31b3a0f6af 100644 --- a/docs/guides/request_loaders.mdx +++ b/docs/guides/request_loaders.mdx @@ -11,10 +11,12 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import RlBasicExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_basic_example.py'; import SitemapExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_example.py'; -import TandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/tandem_example.py'; -import ExplicitTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/tandem_example_explicit.py'; +import RlTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_tandem_example.py'; +import TlExplicitTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_tandem_example_explicit.py'; +import SitemapTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_tandem_example.py'; +import SitemapExplicitTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_tandem_example_explicit.py'; -The [`request_loaders`](https://github.com/apify/crawlee-python/tree/master/src/crawlee/request_loaders) sub-package extends the functionality of the `RequestQueue`, providing additional tools for managing URLs. If you are new to Crawlee, and you do not know the `RequestQueue`, consider starting with the [Storages](https://crawlee.dev/python/docs/guides/storages) guide first. Request loaders define how requests are fetched and stored, enabling various use cases, such as reading URLs from files, external APIs or combining multiple sources together. +The [`request_loaders`](https://github.com/apify/crawlee-python/tree/master/src/crawlee/request_loaders) sub-package extends the functionality of the `RequestQueue`, providing additional tools for managing URLs and requests. If you are new to Crawlee and unfamiliar with the `RequestQueue`, consider starting with the [Storages](https://crawlee.dev/python/docs/guides/storages) guide first. Request loaders define how requests are fetched and stored, enabling various use cases such as reading URLs from files, external APIs, or combining multiple sources together. ## Overview @@ -24,10 +26,10 @@ The [`request_loaders`](https://github.com/apify/crawlee-python/tree/master/src/ - `RequestManager`: Extends `RequestLoader` with write capabilities. - `RequestManagerTandem`: Combines a read-only `RequestLoader` with a writable `RequestManager`. -And specific request loaders: +And specific request loader implementations: -- `RequestList`: A lightweight implementation of request loader for managing a static list of URLs. -- `SitemapRequestLoader`: A request loader that reads URLs from XML sitemaps with filtering capabilities. +- `RequestList`: A lightweight implementation for managing a static list of URLs. +- `SitemapRequestLoader`: A specialized loader that reads URLs from XML sitemaps with filtering capabilities. 
Below is a class diagram that illustrates the relationships between these components and the `RequestQueue`: @@ -75,25 +77,13 @@ class RequestManager { %% Specific classes %% ======================== -class RequestQueue { - _attributes_ - _methods_() -} +class RequestQueue -class RequestList { - _attributes_ - _methods_() -} +class RequestList -class SitemapRequestLoader { - _attributes_ - _methods_() -} +class SitemapRequestLoader -class RequestManagerTandem { - _attributes_ - _methods_() -} +class RequestManagerTandem %% ======================== %% Inheritance arrows @@ -108,11 +98,13 @@ RequestLoader <|-- SitemapRequestLoader RequestManager <|-- RequestManagerTandem ``` -## Request loader +## Request loaders + +The `RequestLoader` interface defines the foundation for fetching requests during a crawl. It provides abstract methods for basic operations like retrieving, marking, and checking the status of requests. Concrete implementations, such as `RequestList`, build on this interface to handle specific scenarios. You can create your own custom loader that reads from an external file, web endpoint, database, or any other specific data source. For more details, refer to the `RequestLoader` API reference. -The `RequestLoader` interface defines the foundation for fetching requests during a crawl. It provides abstract methods for basic operations like retrieving, marking, or checking the status of requests. Concrete implementations, such as `RequestList`, build on this interface to handle specific scenarios. You may create your own loader that reads from an external file, a web endpoint, a database or matches some other specific scenario. For more details refer to the `RequestLoader` API reference. +### Request list -The `RequestList` can accept an asynchronous generator as input. This allows the requests to be streamed, rather than loading them all into memory at once. This can significantly reduce the memory usage, especially when working with large sets of URLs. +The `RequestList` can accept an asynchronous generator as input, allowing requests to be streamed rather than loading them all into memory at once. This can significantly reduce memory usage, especially when working with large sets of URLs. Here is a basic example of working with the `RequestList`: @@ -120,39 +112,60 @@ Here is a basic example of working with the `Req {RlBasicExample} -## Sitemap request loader +### Sitemap request loader -The `SitemapRequestLoader` is a specialized request loader that reads URLs from XML sitemaps. It's particularly useful when you want to crawl a website systematically by following its sitemap structure. The loader supports filtering URLs using glob patterns and regular expressions, allowing you to include or exclude specific types of URLs. The `SitemapRequestLoader` provides streaming processing of sitemaps, which ensures efficient memory usage without loading the entire sitemap into memory. +The `SitemapRequestLoader` is a specialized request loader that reads URLs from XML sitemaps. It's particularly useful when you want to crawl a website systematically by following its sitemap structure. The loader supports filtering URLs using glob patterns and regular expressions, allowing you to include or exclude specific types of URLs. The `SitemapRequestLoader` provides streaming processing of sitemaps, ensuring efficient memory usage without loading the entire sitemap into memory. 
{SitemapExample} -## Request manager +## Request managers -The `RequestManager` extends `RequestLoader` with write capabilities. In addition to reading requests, a request manager can add or reclaim them. This is important for dynamic crawling projects, where new URLs may emerge during the crawl process. Or when certain requests may failed and need to be retried. For more details refer to the `RequestManager` API reference. +The `RequestManager` extends `RequestLoader` with write capabilities. In addition to reading requests, a request manager can add and reclaim them. This is essential for dynamic crawling projects where new URLs may emerge during the crawl process, or when certain requests fail and need to be retried. For more details, refer to the `RequestManager` API reference. ## Request manager tandem -The `RequestManagerTandem` class allows you to combine the read-only capabilities `RequestLoader` (like `RequestList`) with read-write capabilities of a `RequestManager` (like `RequestQueue`). This is useful for scenarios where you need to load initial requests from a static source (like a file or database) and dynamically add or retry requests during the crawl. Additionally, it provides deduplication capabilities, ensuring that requests are not processed multiple times. Under the hood, `RequestManagerTandem` checks whether the read-only loader still has pending requests. If so, each new request from the loader is transferred to the manager. Any newly added or reclaimed requests go directly to the manager side. +The `RequestManagerTandem` class allows you to combine the read-only capabilities of a `RequestLoader` (like `RequestList`) with the read-write capabilities of a `RequestManager` (like `RequestQueue`). This is useful for scenarios where you need to load initial requests from a static source (such as a file or database) and dynamically add or retry requests during the crawl. Additionally, it provides deduplication capabilities, ensuring that requests are not processed multiple times. + +Under the hood, `RequestManagerTandem` checks whether the read-only loader still has pending requests. If so, each new request from the loader is transferred to the manager. Any newly added or reclaimed requests go directly to the manager side. ### Request list with request queue -This sections describes the combination of the `RequestList` and `RequestQueue` classes. This setup is particularly useful when you have a static list of URLs that you want to crawl, but you also need to handle dynamic requests during the crawl process. The `RequestManagerTandem` class facilitates this combination, with the `RequestLoader.to_tandem` method available as a convenient shortcut. Requests from the `RequestList` are processed first by enqueuing them into the default `RequestQueue`, which handles persistence and retries failed requests. +This section describes the combination of the `RequestList` and `RequestQueue` classes. This setup is particularly useful when you have a static list of URLs that you want to crawl, but also need to handle dynamic requests discovered during the crawl process. The `RequestManagerTandem` class facilitates this combination, with the `RequestLoader.to_tandem` method available as a convenient shortcut. Requests from the `RequestList` are processed first by being enqueued into the default `RequestQueue`, which handles persistence and retries for failed requests. 
- + - {ExplicitTandemExample} + {TlExplicitTandemExample} - {TandemExample} + {RlTandemExample} + + + + +### Sitemap request loader with request queue + +Similar to the `RequestList` example above, you can combine a `SitemapRequestLoader` with a `RequestQueue` using the `RequestManagerTandem` class. This setup is particularly useful when you want to crawl URLs from a sitemap while also handling dynamic requests discovered during the crawl process. URLs from the sitemap are processed first by being enqueued into the default `RequestQueue`, which handles persistence and retries for failed requests. + + + + + {SitemapExplicitTandemExample} + + + + + {SitemapTandemExample} ## Conclusion -This guide explained the `request_loaders` sub-package, which extends the functionality of the `RequestQueue` with additional tools for managing URLs. You learned about the `RequestLoader`, `RequestManager`, and `RequestManagerTandem` classes, as well as the `RequestList` and `SitemapRequestLoader` classes. You also saw examples of how to work with these classes in practice. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! +This guide explained the `request_loaders` sub-package, which extends the functionality of the `RequestQueue` with additional tools for managing URLs and requests. You learned about the `RequestLoader`, `RequestManager`, and `RequestManagerTandem` classes, as well as the `RequestList` and `SitemapRequestLoader` implementations. You also saw practical examples of how to work with these classes to handle various crawling scenarios. + +If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/docs/guides/request_router.mdx b/docs/guides/request_router.mdx new file mode 100644 index 0000000000..7f16232693 --- /dev/null +++ b/docs/guides/request_router.mdx @@ -0,0 +1,110 @@ +--- +id: request-router +title: Request router +description: Learn how to use the Router class to organize request handlers, error handlers, and pre-navigation hooks in Crawlee. +--- + +import ApiLink from '@site/src/components/ApiLink'; +import CodeBlock from '@theme/CodeBlock'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import BasicRequestHandlers from '!!raw-loader!roa-loader!./code_examples/request_router/basic_request_handlers.py'; +import SimpleDefaultHandler from '!!raw-loader!roa-loader!./code_examples/request_router/simple_default_handler.py'; +import CustomRouterDefaultOnly from '!!raw-loader!roa-loader!./code_examples/request_router/custom_router_default_only.py'; +import HttpPreNavigation from '!!raw-loader!roa-loader!./code_examples/request_router/http_pre_navigation.py'; +import ErrorHandler from '!!raw-loader!roa-loader!./code_examples/request_router/error_handler.py'; +import FailedRequestHandler from '!!raw-loader!roa-loader!./code_examples/request_router/failed_request_handler.py'; +import PlaywrightPreNavigation from '!!raw-loader!roa-loader!./code_examples/request_router/playwright_pre_navigation.py'; +import AdaptiveCrawlerHandlers from '!!raw-loader!roa-loader!./code_examples/request_router/adaptive_crawler_handlers.py'; + +The `Router` class manages request flow and coordinates the execution of user-defined logic in Crawlee projects. 
It routes incoming requests to appropriate user-defined handlers based on labels, manages error scenarios, and provides hooks for pre-navigation execution. The `Router` serves as the orchestrator for all crawling operations, ensuring that each request is processed by the correct handler according to its type and label. + +## Request handlers + +Request handlers are user-defined functions that process individual requests and their corresponding responses. Each handler receives a crawling context as its primary argument, which provides access to the current request, response data, and utility methods for data extraction, link enqueuing, and storage operations. Handlers determine how different types of pages are processed and how data is extracted and stored. + +:::note + +The code examples in this guide use `ParselCrawler` for demonstration, but the `Router` works with all crawler types. + +::: + +### Built-in router + +Every crawler instance includes a built-in `Router` accessible through the `crawler.router` property. This approach simplifies initial setup and covers basic use cases where request routing requirements are straightforward. + + + {SimpleDefaultHandler} + + +The default handler processes all requests that either lack a label or have a label for which no specific handler has been registered. + +### Custom router + +Applications requiring explicit control over router configuration or router reuse across multiple crawler instances can create custom `Router` instances. Custom routers provide complete control over request routing configuration and enable modular application architecture. Router instances can be configured independently and attached to your crawler instances as needed. + + + {CustomRouterDefaultOnly} + + +### Advanced routing by labels + +More complex crawling projects often require different processing logic for various page types. The router supports label-based routing, which allows registration of specialized handlers for specific content categories. This pattern enables clean separation of concerns and targeted processing logic for different URL patterns or content types. + + + {BasicRequestHandlers} + + +## Error handlers + +Crawlee provides error handling mechanisms to manage request processing failures. It distinguishes between recoverable errors that may succeed on retry and permanent failures that require alternative handling strategies. + +### Error handler + +The error handler executes when exceptions occur during request processing, before any retry attempts. This handler receives the error context and can implement custom recovery logic, modify request parameters, or determine whether the request should be retried. Error handlers enable control over failure scenarios and allow applications to implement error recovery strategies. + + + {ErrorHandler} + + +### Failed request handler + +The failed request handler executes when a request has exhausted all retry attempts and is considered permanently failed. This handler serves as the final opportunity to log failures, store failed requests for later analysis, create alternative requests, or implement fallback processing strategies. + + + {FailedRequestHandler} + + +## Pre-navigation hooks + +Pre-navigation hooks execute before each request is processed, providing opportunities to configure request parameters, modify browser settings, or implement request-specific optimizations. 
You can use pre-navigation hooks for example for viewport configuration, resource blocking, timeout management, header customization, custom proxy rotation, and request interception. + +### HTTP crawler + +HTTP crawlers support pre-navigation hooks that execute before making HTTP requests. These hooks enable request modification, header configuration, and other HTTP-specific optimizations. + + + {HttpPreNavigation} + + +### Playwright crawler + +Playwright crawlers provide extensive pre-navigation capabilities that allow browser page configuration before navigation. These hooks can modify browser behavior and configure page settings. + + + {PlaywrightPreNavigation} + + +### Adaptive Playwright crawler + +The `AdaptivePlaywrightCrawler` implements a dual-hook system with common hooks that execute for all requests and Playwright-specific hooks that execute only when browser automation is required. This is perfect for projects that need both static and dynamic content handling. + + + {AdaptiveCrawlerHandlers} + + +## Conclusion + +This guide introduced you to the `Router` class and how to organize your crawling logic. You learned how to use built-in and custom routers, implement request handlers with label-based routing, handle errors with error and failed request handlers, and configure pre-navigation hooks for different crawler types. + +If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/docs/guides/service_locator.mdx b/docs/guides/service_locator.mdx new file mode 100644 index 0000000000..aa046b3bba --- /dev/null +++ b/docs/guides/service_locator.mdx @@ -0,0 +1,136 @@ +--- +id: service-locator +title: Service locator +description: Crawlee's service locator is a central registry for global services, managing and providing access to them throughout the whole framework. +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import ServiceLocatorConfiguration from '!!raw-loader!roa-loader!./code_examples/service_locator/service_locator_configuration.py'; +import ServiceLocatorStorageClient from '!!raw-loader!roa-loader!./code_examples/service_locator/service_locator_storage_client.py'; +import ServiceLocatorEventManager from '!!raw-loader!roa-loader!./code_examples/service_locator/service_locator_event_manager.py'; + +import ServiceCrawlerConfiguration from '!!raw-loader!roa-loader!./code_examples/service_locator/service_crawler_configuration.py'; +import ServiceCrawlerStorageClient from '!!raw-loader!roa-loader!./code_examples/service_locator/service_crawler_storage_client.py'; +import ServiceCrawlerEventManager from '!!raw-loader!roa-loader!./code_examples/service_locator/service_crawler_event_manager.py'; + +import ServiceStorageConfiguration from '!!raw-loader!roa-loader!./code_examples/service_locator/service_storage_configuration.py'; +import ServiceStorageStorageClient from '!!raw-loader!roa-loader!./code_examples/service_locator/service_storage_storage_client.py'; + +import ServiceConflicts from '!!raw-loader!roa-loader!./code_examples/service_locator/service_conflicts.py'; + +The `ServiceLocator` is a central registry for global services. 
It manages and provides access to these services throughout the framework, ensuring their consistent configuration across all components. + +The service locator manages three core services: `Configuration`, `EventManager`, and `StorageClient`. All services are initialized lazily with defaults when first accessed. + +## Services + +There are three core services that are managed by the service locator: + +### Configuration + +`Configuration` is a class that provides access to application-wide settings and parameters. It allows you to configure various aspects of Crawlee, such as timeouts, logging level, persistence intervals, and various other settings. The configuration can be set directly in the code or via environment variables. + +### StorageClient + +`StorageClient` is the backend implementation for storages in Crawlee. It provides a unified interface for `Dataset`, `KeyValueStore`, and `RequestQueue`, regardless of the underlying storage implementation. Storage clients were already explained in the storage clients section. + +Refer to the [Storage clients guide](./storage-clients) for more information about storage clients and how to use them. + +### EventManager + +`EventManager` is responsible for coordinating internal events in Crawlee. It allows you to register event listeners and emit events throughout the framework. Examples of such events include aborting, migrating, system info, or browser-specific events like page created, page closed, and more. It provides a way to listen to events and execute custom logic when certain events occur. + +## Service registration + +There are several ways to register services in Crawlee, depending on your use case and preferences. + +### Via service locator + +Services can be registered globally through the `ServiceLocator` before they are first accessed. There is a singleton `service_locator` instance that is used throughout the framework, making the registered services available to all components. + + + + + + {ServiceLocatorStorageClient} + + + + + + {ServiceLocatorConfiguration} + + + + + + {ServiceLocatorEventManager} + + + + + +### Via crawler constructors + +Alternatively, services can be passed to the crawler constructors. They will be registered globally with the `ServiceLocator` under the hood, making them available to all components and ensuring consistent configuration. + + + + + + {ServiceCrawlerStorageClient} + + + + + + {ServiceCrawlerConfiguration} + + + + + + {ServiceCrawlerEventManager} + + + + + +### Via storage constructors + +Services can also be provided when opening specific storage instances, in which case they are used only for that particular instance, without affecting the global configuration. + + + + + + {ServiceStorageStorageClient} + + + + + + {ServiceStorageConfiguration} + + + + + +## Conflict prevention + +Once a service has been retrieved from the service locator, attempting to set a different instance will raise a `ServiceConflictError` to prevent accidental configuration conflicts. + + + {ServiceConflicts} + + +## Conclusion + +The `ServiceLocator` is a tool for managing global services in Crawlee. It provides a consistent way to configure and access services throughout the framework, ensuring that all components have access to the same configuration and services. + +If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
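To complement the registration examples above, here is a small sketch of reading the registered services back. It assumes the `service_locator` singleton exposes `get_configuration`, `get_event_manager`, and `get_storage_client` getters as counterparts to the setters shown in this guide; verify the exact names in the `ServiceLocator` API reference.

```python
from crawlee import service_locator

# Retrieve the currently registered services. If nothing has been registered
# explicitly, default instances are created lazily on first access.
configuration = service_locator.get_configuration()
event_manager = service_locator.get_event_manager()
storage_client = service_locator.get_storage_client()

print(configuration, event_manager, storage_client)
```

Keep in mind that once a service has been read this way, registering a different instance afterwards raises the `ServiceConflictError` described above.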
diff --git a/docs/guides/storage_clients.mdx b/docs/guides/storage_clients.mdx index 6175eb2785..febf5933d9 100644 --- a/docs/guides/storage_clients.mdx +++ b/docs/guides/storage_clients.mdx @@ -13,38 +13,69 @@ import MemoryStorageClientBasicExample from '!!raw-loader!roa-loader!./code_exam import FileSystemStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/file_system_storage_client_basic_example.py'; import FileSystemStorageClientConfigurationExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/file_system_storage_client_configuration_example.py'; import CustomStorageClientExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/custom_storage_client_example.py'; +import RegisteringStorageClientsExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/registering_storage_clients_example.py'; -Storage clients in Crawlee are subclasses of `StorageClient`. They handle interactions with different storage backends. For instance: +Storage clients provide a unified interface for interacting with `Dataset`, `KeyValueStore`, and `RequestQueue`, regardless of the underlying implementation. They handle operations like creating, reading, updating, and deleting storage instances, as well as managing data persistence and cleanup. This abstraction makes it easy to switch between different environments, such as local development and cloud production setups. -- `MemoryStorageClient`: Stores data purely in memory with no persistence. -- `FileSystemStorageClient`: Provides persistent file system storage with in-memory caching for better performance. -- [`ApifyStorageClient`](https://docs.apify.com/sdk/python/reference/class/ApifyStorageClient): Manages storage on the [Apify platform](https://apify.com). Apify storage client is implemented in the [Apify SDK](https://github.com/apify/apify-sdk-python). You will find more information about it in the [Apify SDK documentation](https://docs.apify.com/sdk/python/docs/overview/introduction). +## Built-in storage clients -Each storage client is responsible for maintaining the storages in a specific environment. This abstraction makes it easier to switch between different environments, e.g. between local development and cloud production setup. +Crawlee provides three main storage client implementations: -Storage clients provide a unified interface for interacting with `Dataset`, `KeyValueStore`, and `RequestQueue`, regardless of the underlying storage implementation. They handle operations like creating, reading, updating, and deleting storage instances, as well as managing data persistence and cleanup. +- `MemoryStorageClient`: Stores data in memory with no persistence +- `FileSystemStorageClient`: Provides persistent file system storage with in-memory caching +- [`ApifyStorageClient`](https://docs.apify.com/sdk/python/reference/class/ApifyStorageClient): Manages storage on the [Apify platform](https://apify.com), implemented in the [Apify SDK](https://github.com/apify/apify-sdk-python) -## Built-in storage clients +```mermaid +--- +config: + class: + hideEmptyMembersBox: true +--- -Crawlee for Python currently provides two main storage client implementations: +classDiagram -### Memory storage client +%% ======================== +%% Abstract classes +%% ======================== -The `MemoryStorageClient` stores all data in memory using Python data structures. It provides fast access but does not persist data between runs, meaning all data is lost when the program terminates. 
+class StorageClient { + <> +} - -{MemoryStorageClientBasicExample} - +%% ======================== +%% Specific classes +%% ======================== -The `MemoryStorageClient` is a good choice for testing, development, short-lived operations where speed is more important than data persistence, or HTTP APIs where each request should be handled with a fresh storage. It is not suitable for production use or long-running crawls, as all data will be lost when the program exits. +class MemoryStorageClient + +class FileSystemStorageClient + +class ApifyStorageClient + +%% ======================== +%% Inheritance arrows +%% ======================== + +StorageClient <|-- MemoryStorageClient +StorageClient <|-- FileSystemStorageClient +StorageClient <|-- ApifyStorageClient +``` + +### Memory storage client + +The `MemoryStorageClient` stores all data in memory using Python data structures. It provides fast access but does not persist data between runs, meaning all data is lost when the program terminates. This storage client is suitable for testing, development, short-lived operations where speed is prioritized over persistence. It is not recommended for production use or long-running crawls. :::warning Persistence limitation The `MemoryStorageClient` does not persist data between runs. All data is lost when the program terminates. ::: + +{MemoryStorageClientBasicExample} + + ### File system storage client -The `FileSystemStorageClient` provides persistent storage by writing data directly to the file system. It uses smart caching and batch processing for better performance while storing data in human-readable JSON format. This is a default storage client used by Crawlee when no other storage client is specified. +The `FileSystemStorageClient` provides persistent storage by writing data directly to the file system. It uses intelligent caching and batch processing for better performance while storing data in human-readable JSON format. This is the default storage client used by Crawlee when no other storage client is specified, making it ideal for large datasets and long-running operations where data persistence is required. :::warning Concurrency limitation The `FileSystemStorageClient` is not safe for concurrent access from multiple crawler processes. Use it only when running a single crawler process at a time. @@ -56,11 +87,12 @@ This storage client is ideal for large datasets, and long-running operations whe {FileSystemStorageClientBasicExample} -Configuration options for the `FileSystemStorageClient` can be set through environment variables or the `Configuration` class. - - **`storage_dir`** (env: `CRAWLEE_STORAGE_DIR`, default: `'./storage'`): The root directory for all storage data. - - **`purge_on_start`** (env: `CRAWLEE_PURGE_ON_START`, default: `True`): Whether to purge default storages on start. +Configuration options for the `FileSystemStorageClient` can be set through environment variables or the `Configuration` class: + +- **`storage_dir`** (env: `CRAWLEE_STORAGE_DIR`, default: `'./storage'`): The root directory for all storage data +- **`purge_on_start`** (env: `CRAWLEE_PURGE_ON_START`, default: `True`): Whether to purge default storages on start -Data are stored using the following directory structure: +Data is stored using the following directory structure: ```text {CRAWLEE_STORAGE_DIR}/ @@ -83,9 +115,9 @@ Data are stored using the following directory structure: ``` Where: -- `{CRAWLEE_STORAGE_DIR}`: The root directory for local storage. 
-- `{DATASET_NAME}`, `{KVS_NAME}`, `{RQ_NAME}`: The unique names for each storage instance (defaults to `"default"`). -- Files are stored directly without additional metadata files for simpler structure. +- `{CRAWLEE_STORAGE_DIR}`: The root directory for local storage +- `{DATASET_NAME}`, `{KVS_NAME}`, `{RQ_NAME}`: The unique names for each storage instance (defaults to `"default"`) +- Files are stored directly without additional metadata files for simpler structure Here is an example of how to configure the `FileSystemStorageClient`: @@ -107,35 +139,18 @@ Custom storage clients can implement any storage logic, such as connecting to a ## Registering storage clients -Storage clients can be registered either: -- Globally, with the `ServiceLocator` or passed directly to the crawler; -- Or storage specific, when opening a storage instance like `Dataset`, `KeyValueStore`, or `RequestQueue`. - -```python -from crawlee.storage_clients import CustomStorageClient -from crawlee.service_locator import service_locator -from crawlee.crawlers import ParselCrawler -from crawlee.storages import Dataset +Storage clients can be registered in multiple ways: +- **Globally**: Using the `ServiceLocator` or passing directly to the crawler +- **Per storage**: When opening a specific storage instance like `Dataset`, `KeyValueStore`, or `RequestQueue` -# Create custom storage client. -storage_client = CustomStorageClient() -storage_client = CustomStorageClient() - -# Register it either with the service locator. -service_locator.set_storage_client(storage_client) - -# Or pass it directly to the crawler. -crawler = ParselCrawler(storage_client=storage_client) - -# Or just provide it when opening a storage (e.g. dataset). -dataset = await Dataset.open( - name='my_dataset', - storage_client=storage_client, -) -``` + +{RegisteringStorageClientsExample} + -You can also register a different storage client for each storage instance, allowing you to use different backends for different storages. This is useful when you want to use for example a fast in-memory storage for `RequestQueue` while persisting scraping results for `Dataset` or `KeyValueStore`. +You can also register different storage clients for each storage instance, allowing you to use different backends for different storages. This is useful when you want to use a fast in-memory storage for `RequestQueue` while persisting scraping results in `Dataset` or `KeyValueStore`. ## Conclusion -Storage clients in Crawlee provide different backends for storages. Use `MemoryStorageClient` for testing and fast operations without persistence, or `FileSystemStorageClient` for environments where data needs to persist. You can also create custom storage clients for specialized backends by implementing the `StorageClient` interface. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! +Storage clients in Crawlee provide different backends for data storage. Use `MemoryStorageClient` for testing and fast operations without persistence, or `FileSystemStorageClient` for environments where data needs to persist. You can also create custom storage clients for specialized backends by implementing the `StorageClient` interface. + +If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). 
Happy scraping! diff --git a/docs/guides/storages.mdx b/docs/guides/storages.mdx index 22626e7143..53ef9672c4 100644 --- a/docs/guides/storages.mdx +++ b/docs/guides/storages.mdx @@ -34,6 +34,42 @@ Crawlee's storage system consists of two main layers: For more information about storage clients and their configuration, see the [Storage clients guide](./storage-clients). +```mermaid +--- +config: + class: + hideEmptyMembersBox: true +--- + +classDiagram + +%% ======================== +%% Abstract classes +%% ======================== + +class Storage { + <> +} + +%% ======================== +%% Specific classes +%% ======================== + +class Dataset + +class KeyValueStore + +class RequestQueue + +%% ======================== +%% Inheritance arrows +%% ======================== + +Storage <|-- Dataset +Storage <|-- KeyValueStore +Storage <|-- RequestQueue +``` + ## Request queue The `RequestQueue` is the primary storage for URLs in Crawlee, especially useful for deep crawling. It supports dynamic addition and removal of URLs, making it ideal for recursive tasks where URLs are discovered and added during the crawling process (e.g., following links across multiple pages). Each Crawlee project has a **default request queue**, which can be used to store URLs during a specific run. The `RequestQueue` is highly useful for large-scale and complex crawls. @@ -185,4 +221,6 @@ Note that purging behavior may vary between storage client implementations. For ## Conclusion -This guide introduced you to the different storage types available in Crawlee and how to interact with them. You learned how to manage requests using the `RequestQueue` and store and retrieve scraping results using the `Dataset` and `KeyValueStore`. You also discovered how to use helper functions to simplify interactions with these storages. Finally, you learned how to clean up storages before starting a crawler run. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! +This guide introduced you to the different storage types available in Crawlee and how to interact with them. You learned how to manage requests using the `RequestQueue` and store and retrieve scraping results using the `Dataset` and `KeyValueStore`. You also discovered how to use helper functions to simplify interactions with these storages. Finally, you learned how to clean up storages before starting a crawler run. + +If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 
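To complement the `RequestQueue` section of the storages guide above, here is a brief, hedged sketch of working with the queue directly, outside a crawler. The method names (`open`, `add_request`, `fetch_next_request`, `mark_request_as_handled`) reflect the `RequestQueue` API as assumed here; double-check them against the API reference, and note that in typical projects the crawler and its context helpers manage the queue for you.

```python
import asyncio

from crawlee.storages import RequestQueue


async def main() -> None:
    # Open the default request queue (pass `name=` to open a named one).
    rq = await RequestQueue.open()

    # Add a URL; duplicate requests are deduplicated by their unique key.
    await rq.add_request('https://crawlee.dev')

    # Fetch and process requests manually.
    request = await rq.fetch_next_request()
    if request is not None:
        print(f'Processing {request.url} ...')
        await rq.mark_request_as_handled(request)


if __name__ == '__main__':
    asyncio.run(main())
```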
diff --git a/website/sidebars.js b/website/sidebars.js index 85a5224782..e843ac1336 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -24,6 +24,7 @@ module.exports = { { type: 'category', label: 'Guides', + collapsed: true, link: { type: 'generated-index', title: 'Guides', @@ -40,6 +41,7 @@ module.exports = { { type: 'category', label: 'Deployment', + collapsed: true, link: { type: 'generated-index', title: 'Deployment guides', @@ -73,6 +75,7 @@ module.exports = { { type: 'category', label: 'Examples', + collapsed: true, link: { type: 'generated-index', title: 'Examples', @@ -105,6 +108,7 @@ module.exports = { { type: 'category', label: 'Upgrading', + collapsed: true, link: { type: 'generated-index', title: 'Upgrading', From dc4542e7239f3742fb3d156e418ed07aa5a00d35 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 15 Jul 2025 16:18:27 +0200 Subject: [PATCH 02/11] Fix links --- docs/guides/architecture_overview.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guides/architecture_overview.mdx b/docs/guides/architecture_overview.mdx index 3f08fb1093..7c6d3029a8 100644 --- a/docs/guides/architecture_overview.mdx +++ b/docs/guides/architecture_overview.mdx @@ -420,6 +420,6 @@ Statistics are logged at configurable intervals in both table and inline formats ## Conclusion -In this guide, we provided a high-level overview of the core components of the Crawlee library and its architecture. We covered the main components like crawlers, crawling contexts, storages, request routers, service locator, request loaders, event manager, session management, and statistics. Check out other guides, the [API reference](../api), and [Examples](../examples) for more details on how to use these components in your own projects. +In this guide, we provided a high-level overview of the core components of the Crawlee library and its architecture. We covered the main components like crawlers, crawling contexts, storages, request routers, service locator, request loaders, event manager, session management, and statistics. Check out other guides, the [API reference](https://crawlee.dev/python/api), and [Examples](../examples) for more details on how to use these components in your own projects. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 
From ac8acede880df5e0b37df8b135a8ac4f29d217ae Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 15 Jul 2025 16:32:16 +0200 Subject: [PATCH 03/11] better cover impit client --- ....py => parsel_curl_impersonate_example.py} | 8 ++-- ...tpx_example.py => parsel_httpx_example.py} | 8 ++-- .../http_clients/parsel_impit_example.py | 43 +++++++++++++++++++ docs/guides/http_clients.mdx | 28 ++++++++---- 4 files changed, 71 insertions(+), 16 deletions(-) rename docs/guides/code_examples/http_clients/{curl_impersonate_example.py => parsel_curl_impersonate_example.py} (79%) rename docs/guides/code_examples/http_clients/{httpx_example.py => parsel_httpx_example.py} (79%) create mode 100644 docs/guides/code_examples/http_clients/parsel_impit_example.py diff --git a/docs/guides/code_examples/http_clients/curl_impersonate_example.py b/docs/guides/code_examples/http_clients/parsel_curl_impersonate_example.py similarity index 79% rename from docs/guides/code_examples/http_clients/curl_impersonate_example.py rename to docs/guides/code_examples/http_clients/parsel_curl_impersonate_example.py index 28813a2f46..63030b93d7 100644 --- a/docs/guides/code_examples/http_clients/curl_impersonate_example.py +++ b/docs/guides/code_examples/http_clients/parsel_curl_impersonate_example.py @@ -1,6 +1,6 @@ import asyncio -from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext from crawlee.http_clients import CurlImpersonateHttpClient @@ -11,7 +11,7 @@ async def main() -> None: impersonate='chrome131', ) - crawler = BeautifulSoupCrawler( + crawler = ParselCrawler( http_client=http_client, # Limit the crawl to max requests. Remove or increase it for crawling all links. max_requests_per_crawl=10, @@ -19,7 +19,7 @@ async def main() -> None: # Define the default request handler, which will be called for every request. @crawler.router.default_handler - async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + async def request_handler(context: ParselCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Enqueue all links from the page. @@ -28,7 +28,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: # Extract data from the page. data = { 'url': context.request.url, - 'title': context.soup.title.string if context.soup.title else None, + 'title': context.selector.css('title::text').get(), } # Push the extracted data to the default dataset. diff --git a/docs/guides/code_examples/http_clients/httpx_example.py b/docs/guides/code_examples/http_clients/parsel_httpx_example.py similarity index 79% rename from docs/guides/code_examples/http_clients/httpx_example.py rename to docs/guides/code_examples/http_clients/parsel_httpx_example.py index 5c3c4883cb..8075a6d9ef 100644 --- a/docs/guides/code_examples/http_clients/httpx_example.py +++ b/docs/guides/code_examples/http_clients/parsel_httpx_example.py @@ -1,6 +1,6 @@ import asyncio -from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext from crawlee.http_clients import HttpxHttpClient @@ -11,7 +11,7 @@ async def main() -> None: follow_redirects=True, ) - crawler = BeautifulSoupCrawler( + crawler = ParselCrawler( http_client=http_client, # Limit the crawl to max requests. Remove or increase it for crawling all links. 
max_requests_per_crawl=10, @@ -19,7 +19,7 @@ async def main() -> None: # Define the default request handler, which will be called for every request. @crawler.router.default_handler - async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + async def request_handler(context: ParselCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Enqueue all links from the page. @@ -28,7 +28,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: # Extract data from the page. data = { 'url': context.request.url, - 'title': context.soup.title.string if context.soup.title else None, + 'title': context.selector.css('title::text').get(), } # Push the extracted data to the default dataset. diff --git a/docs/guides/code_examples/http_clients/parsel_impit_example.py b/docs/guides/code_examples/http_clients/parsel_impit_example.py new file mode 100644 index 0000000000..5cd90ce4a8 --- /dev/null +++ b/docs/guides/code_examples/http_clients/parsel_impit_example.py @@ -0,0 +1,43 @@ +import asyncio + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext +from crawlee.http_clients import ImpitHttpClient + + +async def main() -> None: + http_client = ImpitHttpClient( + # Optional additional keyword arguments for `impit.AsyncClient`. + http3=True, + browser='firefox', + verify=True, + ) + + crawler = ParselCrawler( + http_client=http_client, + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Enqueue all links from the page. + await context.enqueue_links() + + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': context.selector.css('title::text').get(), + } + + # Push the extracted data to the default dataset. + await context.push_data(data) + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/http_clients.mdx b/docs/guides/http_clients.mdx index eaa18f9ae4..31683ef43e 100644 --- a/docs/guides/http_clients.mdx +++ b/docs/guides/http_clients.mdx @@ -9,8 +9,9 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; -import BsCurlImpersonateExample from '!!raw-loader!roa-loader!./code_examples/http_clients/curl_impersonate_example.py'; -import BsHttpxExample from '!!raw-loader!roa-loader!./code_examples/http_clients/httpx_example.py'; +import ParselHttpxExample from '!!raw-loader!roa-loader!./code_examples/http_clients/parsel_httpx_example.py'; +import ParselCurlImpersonateExample from '!!raw-loader!roa-loader!./code_examples/http_clients/parsel_curl_impersonate_example.py'; +import ParselImpitExample from '!!raw-loader!roa-loader!./code_examples/http_clients/parsel_impit_example.py'; HTTP clients are utilized by HTTP-based crawlers (e.g., `ParselCrawler` and `BeautifulSoupCrawler`) to communicate with web servers. They use external HTTP libraries for communication rather than a browser. 
Examples of such libraries include [httpx](https://pypi.org/project/httpx/), [aiohttp](https://pypi.org/project/aiohttp/), [curl-cffi](https://pypi.org/project/curl-cffi/), and [impit](https://apify.github.io/impit/). After retrieving page content, an HTML parsing library is typically used to facilitate data extraction. Examples of such libraries include [beautifulsoup](https://pypi.org/project/beautifulsoup4/), [parsel](https://pypi.org/project/parsel/), [selectolax](https://pypi.org/project/selectolax/), and [pyquery](https://pypi.org/project/pyquery/). These crawlers are faster than browser-based crawlers but cannot execute client-side JavaScript. @@ -52,19 +53,24 @@ HttpClient <|-- ImpitHttpClient ## Switching between HTTP clients -Crawlee currently provides two main HTTP clients: `HttpxHttpClient`, which uses the `httpx` library, and `CurlImpersonateHttpClient`, which uses the `curl-cffi` library. You can switch between them by setting the `http_client` parameter when initializing a crawler class. The default HTTP client is `HttpxHttpClient`. +Crawlee currently provides three main HTTP clients: `HttpxHttpClient`, which uses the `httpx` library, `CurlImpersonateHttpClient`, which uses the `curl-cffi` library, and `ImpitHttpClient`, which uses the `impit` library. You can switch between them by setting the `http_client` parameter when initializing a crawler class. The default HTTP client is `HttpxHttpClient`. -Below are examples of how to configure the HTTP client for the `BeautifulSoupCrawler`: +Below are examples of how to configure the HTTP client for the `ParselCrawler`: - + - {BsHttpxExample} + {ParselHttpxExample} - + - {BsCurlImpersonateExample} + {ParselCurlImpersonateExample} + + + + + {ParselImpitExample} @@ -79,6 +85,12 @@ For `CurlImpersonateHttpClient``ImpitHttpClient`, you need to install Crawlee with the `impit` extra: + +```sh +python -m pip install 'crawlee[impit]' +``` + Alternatively, you can install all available extras to get access to all HTTP clients and features: ```sh From 2b0cc7f23ba35564510eb3fe022ea5bcc1b58725 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 16 Jul 2025 09:07:08 +0200 Subject: [PATCH 04/11] adaptive pw crawler naming guide/example --- docs/examples/playwright_crawler_adaptive.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/examples/playwright_crawler_adaptive.mdx b/docs/examples/playwright_crawler_adaptive.mdx index c1f8875df8..f915f0246f 100644 --- a/docs/examples/playwright_crawler_adaptive.mdx +++ b/docs/examples/playwright_crawler_adaptive.mdx @@ -1,6 +1,6 @@ --- id: adaptive-playwright-crawler -title: AdaptivePlaywrightCrawler +title: Adaptive Playwright crawler --- import ApiLink from '@site/src/components/ApiLink'; @@ -13,7 +13,7 @@ It uses a more limited crawling context interface so that it is able to switch t A [pre-navigation hook](/python/docs/guides/adaptive-playwright-crawler#page-configuration-with-pre-navigation-hooks) can be used to perform actions before navigating to the URL. This hook provides further flexibility in controlling environment and preparing for navigation. Hooks will be executed both for the pages crawled by HTTP-bases sub crawler and playwright based sub crawler. Use `playwright_only=True` to mark hooks that should be executed only for playwright sub crawler. 
-For more detailed description please see [AdaptivePlaywrightCrawler guide](/python/docs/guides/adaptive-playwright-crawler 'AdaptivePlaywrightCrawler guide') +For more detailed description please see [Adaptive Playwright crawler guide](/python/docs/guides/adaptive-playwright-crawler) {AdaptivePlaywrightCrawlerExample} From 06ee50fe361367a82b4b69dc12f6d26e7146b387 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 16 Jul 2025 13:11:19 +0200 Subject: [PATCH 05/11] add missing http crawler --- docs/guides/architecture_overview.mdx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/guides/architecture_overview.mdx b/docs/guides/architecture_overview.mdx index 7c6d3029a8..04d104dd8f 100644 --- a/docs/guides/architecture_overview.mdx +++ b/docs/guides/architecture_overview.mdx @@ -43,6 +43,8 @@ class AbstractHttpCrawler { %% Specific classes %% ======================== +class HttpCrawler + class ParselCrawler class BeautifulSoupCrawler From 8e3f0f8e4b71182d8368f1b5047e833e011acd93 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 17 Jul 2025 11:25:59 +0200 Subject: [PATCH 06/11] First feedback from Pepa --- docs/guides/architecture_overview.mdx | 28 +++++++++---------- .../request_router/advanced_routing.py | 0 .../request_router/builtin_router_approach.py | 0 .../request_router/custom_router_approach.py | 0 .../request_router/error_handlers.py | 0 docs/guides/http_clients.mdx | 6 ++-- docs/guides/http_crawlers.mdx | 8 +++--- docs/guides/request_loaders.mdx | 16 +++++------ docs/guides/storage_clients.mdx | 6 ++-- docs/guides/storages.mdx | 6 ++-- 10 files changed, 35 insertions(+), 35 deletions(-) delete mode 100644 docs/guides/code_examples/request_router/advanced_routing.py delete mode 100644 docs/guides/code_examples/request_router/builtin_router_approach.py delete mode 100644 docs/guides/code_examples/request_router/custom_router_approach.py delete mode 100644 docs/guides/code_examples/request_router/error_handlers.py diff --git a/docs/guides/architecture_overview.mdx b/docs/guides/architecture_overview.mdx index 04d104dd8f..dc9eb8a470 100644 --- a/docs/guides/architecture_overview.mdx +++ b/docs/guides/architecture_overview.mdx @@ -57,12 +57,12 @@ class AdaptivePlaywrightCrawler %% Inheritance arrows %% ======================== -BasicCrawler <|-- AbstractHttpCrawler -BasicCrawler <|-- PlaywrightCrawler -BasicCrawler <|-- AdaptivePlaywrightCrawler -AbstractHttpCrawler <|-- HttpCrawler -AbstractHttpCrawler <|-- ParselCrawler -AbstractHttpCrawler <|-- BeautifulSoupCrawler +BasicCrawler --|> AbstractHttpCrawler +BasicCrawler --|> PlaywrightCrawler +BasicCrawler --|> AdaptivePlaywrightCrawler +AbstractHttpCrawler --|> HttpCrawler +AbstractHttpCrawler --|> ParselCrawler +AbstractHttpCrawler --|> BeautifulSoupCrawler ``` ### HTTP crawlers @@ -221,9 +221,9 @@ class RequestQueue %% Inheritance arrows %% ======================== -Storage <|-- Dataset -Storage <|-- KeyValueStore -Storage <|-- RequestQueue +Storage --|> Dataset +Storage --|> KeyValueStore +Storage --|> RequestQueue ``` ## Storage clients @@ -267,9 +267,9 @@ class ApifyStorageClient %% Inheritance arrows %% ======================== -StorageClient <|-- MemoryStorageClient -StorageClient <|-- FileSystemStorageClient -StorageClient <|-- ApifyStorageClient +StorageClient --|> MemoryStorageClient +StorageClient --|> FileSystemStorageClient +StorageClient --|> ApifyStorageClient ``` Storage clients can be registered globally with the `ServiceLocator` (you will learn more about the `ServiceLocator` in the next section), passed 
directly to crawlers, or specified when opening individual storage instances. You can also create custom storage clients by implementing the `StorageClient` interface. @@ -381,8 +381,8 @@ class ApifyEventManager %% Inheritance arrows %% ======================== -EventManager <|-- LocalEventManager -EventManager <|-- ApifyEventManager +EventManager --|> LocalEventManager +EventManager --|> ApifyEventManager ``` ## Session management diff --git a/docs/guides/code_examples/request_router/advanced_routing.py b/docs/guides/code_examples/request_router/advanced_routing.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/guides/code_examples/request_router/builtin_router_approach.py b/docs/guides/code_examples/request_router/builtin_router_approach.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/guides/code_examples/request_router/custom_router_approach.py b/docs/guides/code_examples/request_router/custom_router_approach.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/guides/code_examples/request_router/error_handlers.py b/docs/guides/code_examples/request_router/error_handlers.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/guides/http_clients.mdx b/docs/guides/http_clients.mdx index 31683ef43e..8ddf06b203 100644 --- a/docs/guides/http_clients.mdx +++ b/docs/guides/http_clients.mdx @@ -46,9 +46,9 @@ class ImpitHttpClient %% Inheritance arrows %% ======================== -HttpClient <|-- HttpxHttpClient -HttpClient <|-- CurlImpersonateHttpClient -HttpClient <|-- ImpitHttpClient +HttpClient --|> HttpxHttpClient +HttpClient --|> CurlImpersonateHttpClient +HttpClient --|> ImpitHttpClient ``` ## Switching between HTTP clients diff --git a/docs/guides/http_crawlers.mdx b/docs/guides/http_crawlers.mdx index a533f8b0cb..3cd29ed314 100644 --- a/docs/guides/http_crawlers.mdx +++ b/docs/guides/http_crawlers.mdx @@ -54,10 +54,10 @@ class BeautifulSoupCrawler %% Inheritance arrows %% ======================== -BasicCrawler <|-- AbstractHttpCrawler -AbstractHttpCrawler <|-- HttpCrawler -AbstractHttpCrawler <|-- ParselCrawler -AbstractHttpCrawler <|-- BeautifulSoupCrawler +BasicCrawler --|> AbstractHttpCrawler +AbstractHttpCrawler --|> HttpCrawler +AbstractHttpCrawler --|> ParselCrawler +AbstractHttpCrawler --|> BeautifulSoupCrawler ``` ## BeautifulSoupCrawler diff --git a/docs/guides/request_loaders.mdx b/docs/guides/request_loaders.mdx index 31b3a0f6af..ce5c0d13cf 100644 --- a/docs/guides/request_loaders.mdx +++ b/docs/guides/request_loaders.mdx @@ -12,7 +12,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import RlBasicExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_basic_example.py'; import SitemapExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_example.py'; import RlTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_tandem_example.py'; -import TlExplicitTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_tandem_example_explicit.py'; +import RlExplicitTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_tandem_example_explicit.py'; import SitemapTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_tandem_example.py'; import SitemapExplicitTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_tandem_example_explicit.py'; @@ -89,13 +89,13 @@ class RequestManagerTandem %% Inheritance 
arrows %% ======================== -Storage <|-- RequestQueue -RequestManager <|-- RequestQueue +Storage --|> RequestQueue +RequestManager --|> RequestQueue -RequestLoader <|-- RequestManager -RequestLoader <|-- RequestList -RequestLoader <|-- SitemapRequestLoader -RequestManager <|-- RequestManagerTandem +RequestLoader --|> RequestManager +RequestLoader --|> RequestList +RequestLoader --|> SitemapRequestLoader +RequestManager --|> RequestManagerTandem ``` ## Request loaders @@ -137,7 +137,7 @@ This section describes the combination of the `R - {TlExplicitTandemExample} + {RlExplicitTandemExample} diff --git a/docs/guides/storage_clients.mdx b/docs/guides/storage_clients.mdx index febf5933d9..62c1b32deb 100644 --- a/docs/guides/storage_clients.mdx +++ b/docs/guides/storage_clients.mdx @@ -56,9 +56,9 @@ class ApifyStorageClient %% Inheritance arrows %% ======================== -StorageClient <|-- MemoryStorageClient -StorageClient <|-- FileSystemStorageClient -StorageClient <|-- ApifyStorageClient +StorageClient --|> MemoryStorageClient +StorageClient --|> FileSystemStorageClient +StorageClient --|> ApifyStorageClient ``` ### Memory storage client diff --git a/docs/guides/storages.mdx b/docs/guides/storages.mdx index 53ef9672c4..e9bf312c97 100644 --- a/docs/guides/storages.mdx +++ b/docs/guides/storages.mdx @@ -65,9 +65,9 @@ class RequestQueue %% Inheritance arrows %% ======================== -Storage <|-- Dataset -Storage <|-- KeyValueStore -Storage <|-- RequestQueue +Storage --|> Dataset +Storage --|> KeyValueStore +Storage --|> RequestQueue ``` ## Request queue From 906b9b6a1be5663d87c54e3b43c4807a0f943705 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 17 Jul 2025 11:49:21 +0200 Subject: [PATCH 07/11] add max requests per crawl --- .../code_examples/adaptive_playwright_crawler.py | 3 ++- .../request_loaders/rl_tandem_example.py | 5 ++++- .../request_loaders/rl_tandem_example_explicit.py | 5 ++++- .../request_loaders/sitemap_tandem_example.py | 5 ++++- .../sitemap_tandem_example_explicit.py | 5 ++++- .../request_router/adaptive_crawler_handlers.py | 10 ++++++++-- .../request_router/basic_request_handlers.py | 2 +- .../request_router/custom_router_default_only.py | 2 +- .../code_examples/request_router/error_handler.py | 4 +++- .../request_router/failed_request_handler.py | 2 +- .../request_router/http_pre_navigation.py | 14 +++++++++++--- .../request_router/playwright_pre_navigation.py | 4 +++- .../request_router/simple_default_handler.py | 4 +++- docs/introduction/03_adding_more_urls.mdx | 2 +- .../_adaptive_playwright_crawler.py | 3 ++- website/src/pages/home_page_example.py | 2 +- 16 files changed, 53 insertions(+), 19 deletions(-) diff --git a/docs/examples/code_examples/adaptive_playwright_crawler.py b/docs/examples/code_examples/adaptive_playwright_crawler.py index f2851d502b..64c96ff5cd 100644 --- a/docs/examples/code_examples/adaptive_playwright_crawler.py +++ b/docs/examples/code_examples/adaptive_playwright_crawler.py @@ -14,7 +14,8 @@ async def main() -> None: # Crawler created by following factory method will use `beautifulsoup` # for parsing static content. crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( - max_requests_per_crawl=10, playwright_crawler_specific_kwargs={'headless': False} + max_requests_per_crawl=10, # Limit the max requests per crawl. 
+ playwright_crawler_specific_kwargs={'headless': False} ) @crawler.router.default_handler diff --git a/docs/guides/code_examples/request_loaders/rl_tandem_example.py b/docs/guides/code_examples/request_loaders/rl_tandem_example.py index b0e83138ca..eddb63af9a 100644 --- a/docs/guides/code_examples/request_loaders/rl_tandem_example.py +++ b/docs/guides/code_examples/request_loaders/rl_tandem_example.py @@ -13,7 +13,10 @@ async def main() -> None: request_manager = await request_list.to_tandem() # Create a crawler and pass the request manager to it. - crawler = ParselCrawler(request_manager=request_manager) + crawler = ParselCrawler( + request_manager=request_manager, + max_requests_per_crawl=10, # Limit the max requests per crawl. + ) @crawler.router.default_handler async def handler(context: ParselCrawlingContext) -> None: diff --git a/docs/guides/code_examples/request_loaders/rl_tandem_example_explicit.py b/docs/guides/code_examples/request_loaders/rl_tandem_example_explicit.py index 17ba20c392..7972804d76 100644 --- a/docs/guides/code_examples/request_loaders/rl_tandem_example_explicit.py +++ b/docs/guides/code_examples/request_loaders/rl_tandem_example_explicit.py @@ -16,7 +16,10 @@ async def main() -> None: request_manager = RequestManagerTandem(request_list, request_queue) # Create a crawler and pass the request manager to it. - crawler = ParselCrawler(request_manager=request_manager) + crawler = ParselCrawler( + request_manager=request_manager, + max_requests_per_crawl=10, # Limit the max requests per crawl. + ) @crawler.router.default_handler async def handler(context: ParselCrawlingContext) -> None: diff --git a/docs/guides/code_examples/request_loaders/sitemap_tandem_example.py b/docs/guides/code_examples/request_loaders/sitemap_tandem_example.py index f43503eaf3..61608112e4 100644 --- a/docs/guides/code_examples/request_loaders/sitemap_tandem_example.py +++ b/docs/guides/code_examples/request_loaders/sitemap_tandem_example.py @@ -23,7 +23,10 @@ async def main() -> None: request_manager = await sitemap_loader.to_tandem() # Create a crawler and pass the request manager to it. - crawler = ParselCrawler(request_manager=request_manager) + crawler = ParselCrawler( + request_manager=request_manager, + max_requests_per_crawl=10, # Limit the max requests per crawl. + ) @crawler.router.default_handler async def handler(context: ParselCrawlingContext) -> None: diff --git a/docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py b/docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py index 46084f6828..e0b5a118cc 100644 --- a/docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py +++ b/docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py @@ -26,7 +26,10 @@ async def main() -> None: request_manager = RequestManagerTandem(sitemap_loader, request_queue) # Create a crawler and pass the request manager to it. - crawler = ParselCrawler(request_manager=request_manager) + crawler = ParselCrawler( + request_manager=request_manager, + max_requests_per_crawl=10, # Limit the max requests per crawl. 
+ ) @crawler.router.default_handler async def handler(context: ParselCrawlingContext) -> None: diff --git a/docs/guides/code_examples/request_router/adaptive_crawler_handlers.py b/docs/guides/code_examples/request_router/adaptive_crawler_handlers.py index fbe0d412bd..f4d0219a8d 100644 --- a/docs/guides/code_examples/request_router/adaptive_crawler_handlers.py +++ b/docs/guides/code_examples/request_router/adaptive_crawler_handlers.py @@ -1,5 +1,6 @@ import asyncio +from crawlee import HttpHeaders from crawlee.crawlers import ( AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext, @@ -12,14 +13,19 @@ async def main() -> None: crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( - max_requests_per_crawl=5, + max_requests_per_crawl=10, # Limit the max requests per crawl. ) # Common pre-navigation hook (runs for all requests) @crawler.pre_navigation_hook async def common_setup(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: # This runs for both HTTP and browser requests - context.request.headers['Accept'] = 'text/html,application/xhtml+xml' + context.request.headers = HttpHeaders( + { + **context.request.headers, + 'Accept': 'text/html,application/xhtml+xml', + }, + ) # Playwright-specific pre-navigation hook (only when using browser) @crawler.pre_navigation_hook(playwright_only=True) diff --git a/docs/guides/code_examples/request_router/basic_request_handlers.py b/docs/guides/code_examples/request_router/basic_request_handlers.py index 4d67550fa4..ef88714876 100644 --- a/docs/guides/code_examples/request_router/basic_request_handlers.py +++ b/docs/guides/code_examples/request_router/basic_request_handlers.py @@ -71,7 +71,7 @@ async def product_handler(context: ParselCrawlingContext) -> None: # Create crawler with the router crawler = ParselCrawler( request_handler=router, - max_requests_per_crawl=20, + max_requests_per_crawl=10, # Limit the max requests per crawl. ) # Start crawling with some initial requests diff --git a/docs/guides/code_examples/request_router/custom_router_default_only.py b/docs/guides/code_examples/request_router/custom_router_default_only.py index 5ace98cf1b..d6768d5777 100644 --- a/docs/guides/code_examples/request_router/custom_router_default_only.py +++ b/docs/guides/code_examples/request_router/custom_router_default_only.py @@ -33,7 +33,7 @@ async def default_handler(context: ParselCrawlingContext) -> None: # Create crawler with the custom router crawler = ParselCrawler( request_handler=router, - max_requests_per_crawl=10, + max_requests_per_crawl=10, # Limit the max requests per crawl. ) # Start crawling diff --git a/docs/guides/code_examples/request_router/error_handler.py b/docs/guides/code_examples/request_router/error_handler.py index 40b7e99fa5..b240e72eca 100644 --- a/docs/guides/code_examples/request_router/error_handler.py +++ b/docs/guides/code_examples/request_router/error_handler.py @@ -9,7 +9,9 @@ async def main() -> None: # Create a crawler instance - crawler = ParselCrawler(max_requests_per_crawl=10) + crawler = ParselCrawler( + max_requests_per_crawl=10, # Limit the max requests per crawl. 
+ ) @crawler.router.default_handler async def default_handler(context: ParselCrawlingContext) -> None: diff --git a/docs/guides/code_examples/request_router/failed_request_handler.py b/docs/guides/code_examples/request_router/failed_request_handler.py index 0f1634c1b1..e09940b990 100644 --- a/docs/guides/code_examples/request_router/failed_request_handler.py +++ b/docs/guides/code_examples/request_router/failed_request_handler.py @@ -6,7 +6,7 @@ async def main() -> None: # Create a crawler instance with retry settings crawler = ParselCrawler( - max_requests_per_crawl=10, + max_requests_per_crawl=10, # Limit the max requests per crawl. max_request_retries=2, # Allow 2 retries before failing ) diff --git a/docs/guides/code_examples/request_router/http_pre_navigation.py b/docs/guides/code_examples/request_router/http_pre_navigation.py index 05db6eff15..b5b3aa3310 100644 --- a/docs/guides/code_examples/request_router/http_pre_navigation.py +++ b/docs/guides/code_examples/request_router/http_pre_navigation.py @@ -1,16 +1,24 @@ import asyncio +from crawlee import HttpHeaders from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext async def main() -> None: - crawler = ParselCrawler() + crawler = ParselCrawler( + max_requests_per_crawl=10, # Limit the max requests per crawl. + ) @crawler.pre_navigation_hook async def setup_request(context: BasicCrawlingContext) -> None: # Add custom headers before making the request - context.request.headers['User-Agent'] = 'Crawlee Bot 1.0' - context.request.headers['Accept'] = 'text/html,application/xhtml+xml' + context.request.headers = HttpHeaders( + { + **context.request.headers, + 'User-Agent': 'Crawlee Bot 1.0', + 'Accept': 'text/html,application/xhtml+xml', + }, + ) @crawler.router.default_handler async def default_handler(context: ParselCrawlingContext) -> None: diff --git a/docs/guides/code_examples/request_router/playwright_pre_navigation.py b/docs/guides/code_examples/request_router/playwright_pre_navigation.py index 7940cc072c..eecea7271b 100644 --- a/docs/guides/code_examples/request_router/playwright_pre_navigation.py +++ b/docs/guides/code_examples/request_router/playwright_pre_navigation.py @@ -8,7 +8,9 @@ async def main() -> None: - crawler = PlaywrightCrawler(max_requests_per_crawl=5) + crawler = PlaywrightCrawler( + max_requests_per_crawl=10, # Limit the max requests per crawl. + ) @crawler.pre_navigation_hook async def setup_page(context: PlaywrightPreNavCrawlingContext) -> None: diff --git a/docs/guides/code_examples/request_router/simple_default_handler.py b/docs/guides/code_examples/request_router/simple_default_handler.py index e055491844..92c35651a1 100644 --- a/docs/guides/code_examples/request_router/simple_default_handler.py +++ b/docs/guides/code_examples/request_router/simple_default_handler.py @@ -5,7 +5,9 @@ async def main() -> None: # Create a crawler instance - crawler = ParselCrawler(max_requests_per_crawl=10) + crawler = ParselCrawler( + max_requests_per_crawl=10, # Limit the max requests per crawl. 
+ ) # Use the crawler's built-in router to define a default handler @crawler.router.default_handler diff --git a/docs/introduction/03_adding_more_urls.mdx b/docs/introduction/03_adding_more_urls.mdx index a9669fb8a3..db43311d07 100644 --- a/docs/introduction/03_adding_more_urls.mdx +++ b/docs/introduction/03_adding_more_urls.mdx @@ -43,7 +43,7 @@ The `enqueue_links` function When you're just testing your code or when your crawler could potentially find millions of links, it's very useful to set a maximum limit of crawled pages. The option is called `max_requests_per_crawl`, is available in all crawlers, and you can set it like this: ```python -crawler = BeautifulSoupCrawler(max_requests_per_crawl=20) +crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) ``` This means that no new requests will be started after the 20th request is finished. The actual number of processed requests might be a little higher thanks to parallelization, because the running requests won't be forcefully aborted. It's not even possible in most cases. diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 9a9970eba4..06ac213047 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -99,7 +99,8 @@ class AdaptivePlaywrightCrawler( from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( - max_requests_per_crawl=5, playwright_crawler_specific_kwargs={'browser_type': 'chromium'} + max_requests_per_crawl=10, # Limit the max requests per crawl. + playwright_crawler_specific_kwargs={'browser_type': 'chromium'}, ) @crawler.router.default_handler diff --git a/website/src/pages/home_page_example.py b/website/src/pages/home_page_example.py index fe5b80eebc..60456028d1 100644 --- a/website/src/pages/home_page_example.py +++ b/website/src/pages/home_page_example.py @@ -3,7 +3,7 @@ async def main() -> None: crawler = PlaywrightCrawler( - max_requests_per_crawl=5, # Limit the crawl to 5 requests at most. + max_requests_per_crawl=10, # Limit the max requests per crawl. headless=False, # Show the browser window. browser_type='firefox', # Use the Firefox browser. ) From 2b7668ae21d05bbb6745811e052ef57534d913a8 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 17 Jul 2025 11:51:38 +0200 Subject: [PATCH 08/11] linter --- docs/examples/code_examples/adaptive_playwright_crawler.py | 2 +- .../request_loaders/sitemap_tandem_example_explicit.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/examples/code_examples/adaptive_playwright_crawler.py b/docs/examples/code_examples/adaptive_playwright_crawler.py index 64c96ff5cd..904a000379 100644 --- a/docs/examples/code_examples/adaptive_playwright_crawler.py +++ b/docs/examples/code_examples/adaptive_playwright_crawler.py @@ -15,7 +15,7 @@ async def main() -> None: # for parsing static content. crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( max_requests_per_crawl=10, # Limit the max requests per crawl. 
- playwright_crawler_specific_kwargs={'headless': False} + playwright_crawler_specific_kwargs={'headless': False}, ) @crawler.router.default_handler diff --git a/docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py b/docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py index e0b5a118cc..5089fe8902 100644 --- a/docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py +++ b/docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py @@ -27,9 +27,9 @@ async def main() -> None: # Create a crawler and pass the request manager to it. crawler = ParselCrawler( - request_manager=request_manager, - max_requests_per_crawl=10, # Limit the max requests per crawl. - ) + request_manager=request_manager, + max_requests_per_crawl=10, # Limit the max requests per crawl. + ) @crawler.router.default_handler async def handler(context: ParselCrawlingContext) -> None: From 13fdabf74d711875a099fd253e30431e27d2bcbb Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 17 Jul 2025 13:41:53 +0200 Subject: [PATCH 09/11] address Honza's feedback --- docs/guides/architecture_overview.mdx | 60 +++++++------------ .../change_handle_error_status.py | 8 +-- .../adaptive_crawler_handlers.py | 7 +-- .../request_router/http_pre_navigation.py | 3 +- docs/guides/request_router.mdx | 2 + docs/guides/storage_clients.mdx | 58 +++++++++--------- docs/guides/storages.mdx | 2 +- 7 files changed, 58 insertions(+), 82 deletions(-) diff --git a/docs/guides/architecture_overview.mdx b/docs/guides/architecture_overview.mdx index dc9eb8a470..81243174b8 100644 --- a/docs/guides/architecture_overview.mdx +++ b/docs/guides/architecture_overview.mdx @@ -6,11 +6,11 @@ description: An overview of the core components of the Crawlee library and its a import ApiLink from '@site/src/components/ApiLink'; -Crawlee is a modern and modular web scraping framework. It is designed for both HTTP and browser-based scraping. In this guide, we will provide a high-level overview of its architecture and the main components that make up the system. +Crawlee is a modern and modular web scraping framework. It is designed for both HTTP-only and browser-based scraping. In this guide, we will provide a high-level overview of its architecture and the main components that make up the system. ## Crawler -The core component of Crawlee is the crawler, which orchestrates the crawling process and takes care of all other components. It manages storages, executes user-defined request handlers, handles retries, manages concurrency, and coordinates all other components. All crawlers inherit from the `BasicCrawler` class, which provides the basic functionality. There are two main groups of specialized crawlers: HTTP crawlers and browser crawlers. +The main user-facing component of Crawlee is the crawler, which orchestrates the crawling process and takes care of all other components. It manages storages, executes user-defined request handlers, handles retries, manages concurrency, and coordinates all other components. All crawlers inherit from the `BasicCrawler` class, which provides the basic functionality. There are two main groups of specialized crawlers: HTTP crawlers and browser crawlers. :::info @@ -83,11 +83,11 @@ Browser crawlers use a real browser to render pages, enabling scraping of sites ### Adaptive crawler -The `AdaptivePlaywrightCrawler` sits between HTTP and browser crawlers. 
It can automatically decide whether to use HTTP or browser crawling for each request based on heuristics or user configuration. This allows for optimal performance and compatibility. You can learn more about adaptive crawling in the [Adaptive Playwright crawler guide](./adaptive-playwright-crawler). +The `AdaptivePlaywrightCrawler` sits between HTTP and browser crawlers. It can automatically decide whether to use HTTP or browser crawling for each request based on heuristics or user configuration. This allows for optimal performance and compatibility. It also provides a uniform interface for both crawling types (modes). You can learn more about adaptive crawling in the [Adaptive Playwright crawler guide](./adaptive-playwright-crawler). ## Crawling contexts -Crawling contexts are objects that encapsulate the state and data for each request being processed by the crawler. They provide access to the request, response, session, and helper methods for handling the request. Crawling contexts are used to pass data between different parts of the crawler and to manage the lifecycle of each request. These contexts are provided to user-defined request handlers, which can then use them to access request data, response data, and other information related to the request. +Crawling contexts are objects that encapsulate the state and data for each request being processed by the crawler. They provide access to the request, response, session, and helper methods for handling the request. Crawling contexts are used to pass data between different parts of the crawler and to manage the lifecycle of each request. These contexts are provided to user-defined request handlers, which can then use them to access request data, response data, or use helper methods to interact with storages, and extract and enqueue new requests. ```mermaid --- @@ -102,45 +102,25 @@ classDiagram %% Classes %% ======================== -class BasicCrawlingContext { - <> -} +class BasicCrawlingContext -class HttpCrawlingContext { - <> -} +class HttpCrawlingContext -class HttpCrawlingResult { - <> -} +class HttpCrawlingResult -class ParsedHttpCrawlingContext { - <> -} +class ParsedHttpCrawlingContext -class ParselCrawlingContext { - <> -} +class ParselCrawlingContext -class BeautifulSoupCrawlingContext { - <> -} +class BeautifulSoupCrawlingContext -class PlaywrightPreNavCrawlingContext { - <> -} +class PlaywrightPreNavCrawlingContext -class PlaywrightCrawlingContext { - <> -} +class PlaywrightCrawlingContext -class AdaptivePlaywrightPreNavCrawlingContext { - <> -} +class AdaptivePlaywrightPreNavCrawlingContext -class AdaptivePlaywrightCrawlingContext { - <> -} +class AdaptivePlaywrightCrawlingContext %% ======================== %% Inheritance arrows @@ -184,9 +164,9 @@ Storages are the components that manage data in Crawlee. They provide a way to s Crawlee provides three built-in storage types for managing data: -- `Dataset` - Append-only, tabular storage that stores structured results (e.g., scraped data). -- `KeyValueStore` - Stores arbitrary data like JSON documents, images, configs, or state. -- `RequestQueue` - Manages pending and handled requests. +- `Dataset` - Append-only, tabular storage for structured data. It is ideal for storing scraping results. +- `KeyValueStore` - Storage for arbitrary data like JSON documents, images or configs. It supports get and set operations with key-value pairs; updates are only possible by replacement. 
+- `RequestQueue` - A managed queue for pending and completed requests, with automatic deduplication and dynamic addition of new items. It is used to track URLs for crawling. See the [Storages guide](./storages) for more details. @@ -295,7 +275,7 @@ See the [Request router guide](./request-router) for detailed information and ex ## Service locator -The `ServiceLocator` is a central registry for global services in Crawlee. It manages and provides access to core services throughout the framework, ensuring consistent configuration across all components. The service locator acts as a dependency injection container that coordinates three essential services: +The `ServiceLocator` is a central registry for global services in Crawlee. It manages and provides access to core services throughout the framework, ensuring consistent configuration across all components. The service locator coordinates these three services: - `Configuration` - Application-wide settings and parameters that control various aspects of Crawlee behavior. - `StorageClient` - Backend implementation for data storage across datasets, key-value stores, and request queues. @@ -315,9 +295,9 @@ Request loaders provide a subset of `RequestQue ### Request managers -`RequestManager` extends `RequestLoader` with write capabilities for adding and reclaiming requests, providing full request management functionality. `RequestQueue` is the primary concrete implementation of `RequestManager`. +`RequestManager` extends `RequestLoader` with write capabilities for adding and reclaiming requests, providing full request management functionality. `RequestQueue` is the primary concrete implementation of `RequestManager`. -`RequestManagerTandem` combines a read-only `RequestLoader` with a writable `RequestManager`, transferring requests from the loader to the manager for hybrid scenarios. This is useful when you want to start with a predefined set of URLs (from a file or sitemap) but also need to add new requests dynamically during crawling. The tandem first processes all requests from the loader, then handles any additional requests added to the manager. +`RequestManagerTandem` combines a read-only `RequestLoader` with a writable `RequestManager`, transferring requests from the loader to the manager for hybrid scenarios. This is useful when you want to start with a predefined set of URLs (from a file or sitemap) but also need to add new requests dynamically during crawling. The tandem first processes all requests from the loader, then handles any additional requests added to the manager. Request loaders are useful when you need to start with a predefined set of URLs. The tandem approach allows processing requests from static sources (like files or sitemaps) while maintaining the ability to add new requests dynamically. 
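For context on the tandem approach described in the architecture overview above, here is a minimal illustrative sketch in the spirit of the request-loader examples added by this patch. It is not part of the patch itself; it assumes `RequestList` is importable from `crawlee.request_loaders` and that the `to_tandem()` helper behaves as the examples in this series suggest.

```python
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.request_loaders import RequestList  # assumed import path


async def main() -> None:
    # Start from a static, read-only list of URLs ...
    request_list = RequestList(['https://crawlee.dev/'])

    # ... and combine it with the default request queue so that newly
    # discovered links can still be enqueued during the crawl.
    request_manager = await request_list.to_tandem()

    crawler = ParselCrawler(
        request_manager=request_manager,
        max_requests_per_crawl=10,  # Limit the max requests per crawl.
    )

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        # New links go to the writable part of the tandem (the request queue).
        await context.enqueue_links()

    await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())
```

The loader is drained first; anything enqueued by the handler is then processed from the request queue, which is the hybrid behavior the guide describes.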
diff --git a/docs/guides/code_examples/error_handling/change_handle_error_status.py b/docs/guides/code_examples/error_handling/change_handle_error_status.py index 4b69a54007..55bf5a0e61 100644 --- a/docs/guides/code_examples/error_handling/change_handle_error_status.py +++ b/docs/guides/code_examples/error_handling/change_handle_error_status.py @@ -32,11 +32,9 @@ async def default_handler(context: HttpCrawlingContext) -> None: ) data = json.loads(await response.read()) # Add the new token to our `Request` headers - new_headers = { - **context.request.headers, - 'authorization': f'Bearer {data["access_token"]}', - } - context.request.headers = HttpHeaders(new_headers) + context.request.headers |= HttpHeaders( + {'authorization': f'Bearer {data["access_token"]}'}, + ) # Trigger a retry with our updated headers raise HttpStatusCodeError('Unauthorized', status_code=UNAUTHORIZED_CODE) diff --git a/docs/guides/code_examples/request_router/adaptive_crawler_handlers.py b/docs/guides/code_examples/request_router/adaptive_crawler_handlers.py index f4d0219a8d..a2a3892fcb 100644 --- a/docs/guides/code_examples/request_router/adaptive_crawler_handlers.py +++ b/docs/guides/code_examples/request_router/adaptive_crawler_handlers.py @@ -20,11 +20,8 @@ async def main() -> None: @crawler.pre_navigation_hook async def common_setup(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: # This runs for both HTTP and browser requests - context.request.headers = HttpHeaders( - { - **context.request.headers, - 'Accept': 'text/html,application/xhtml+xml', - }, + context.request.headers |= HttpHeaders( + {'Accept': 'text/html,application/xhtml+xml'}, ) # Playwright-specific pre-navigation hook (only when using browser) diff --git a/docs/guides/code_examples/request_router/http_pre_navigation.py b/docs/guides/code_examples/request_router/http_pre_navigation.py index b5b3aa3310..84926f6fe4 100644 --- a/docs/guides/code_examples/request_router/http_pre_navigation.py +++ b/docs/guides/code_examples/request_router/http_pre_navigation.py @@ -12,9 +12,8 @@ async def main() -> None: @crawler.pre_navigation_hook async def setup_request(context: BasicCrawlingContext) -> None: # Add custom headers before making the request - context.request.headers = HttpHeaders( + context.request.headers |= HttpHeaders( { - **context.request.headers, 'User-Agent': 'Crawlee Bot 1.0', 'Accept': 'text/html,application/xhtml+xml', }, diff --git a/docs/guides/request_router.mdx b/docs/guides/request_router.mdx index 7f16232693..d9d7733abf 100644 --- a/docs/guides/request_router.mdx +++ b/docs/guides/request_router.mdx @@ -43,6 +43,8 @@ The default handler processes all requests that either lack a label or have a la Applications requiring explicit control over router configuration or router reuse across multiple crawler instances can create custom `Router` instances. Custom routers provide complete control over request routing configuration and enable modular application architecture. Router instances can be configured independently and attached to your crawler instances as needed. +You can also implement a custom request router class from scratch or by inheriting from `Router`. This allows you to define custom routing logic or manage request handlers in a different way. 
+ {CustomRouterDefaultOnly} diff --git a/docs/guides/storage_clients.mdx b/docs/guides/storage_clients.mdx index 62c1b32deb..0c2a14ffe9 100644 --- a/docs/guides/storage_clients.mdx +++ b/docs/guides/storage_clients.mdx @@ -21,9 +21,9 @@ Storage clients provide a unified interface for interacting with `MemoryStorageClient`: Stores data in memory with no persistence -- `FileSystemStorageClient`: Provides persistent file system storage with in-memory caching -- [`ApifyStorageClient`](https://docs.apify.com/sdk/python/reference/class/ApifyStorageClient): Manages storage on the [Apify platform](https://apify.com), implemented in the [Apify SDK](https://github.com/apify/apify-sdk-python) +- `FileSystemStorageClient` - Provides persistent file system storage with in-memory caching. +- `MemoryStorageClient` - Stores data in memory with no persistence. +- [`ApifyStorageClient`](https://docs.apify.com/sdk/python/reference/class/ApifyStorageClient) - Manages storage on the [Apify platform](https://apify.com), implemented in the [Apify SDK](https://github.com/apify/apify-sdk-python). ```mermaid --- @@ -46,33 +46,21 @@ class StorageClient { %% Specific classes %% ======================== -class MemoryStorageClient - class FileSystemStorageClient +class MemoryStorageClient + class ApifyStorageClient %% ======================== %% Inheritance arrows %% ======================== -StorageClient --|> MemoryStorageClient StorageClient --|> FileSystemStorageClient +StorageClient --|> MemoryStorageClient StorageClient --|> ApifyStorageClient ``` -### Memory storage client - -The `MemoryStorageClient` stores all data in memory using Python data structures. It provides fast access but does not persist data between runs, meaning all data is lost when the program terminates. This storage client is suitable for testing, development, short-lived operations where speed is prioritized over persistence. It is not recommended for production use or long-running crawls. - -:::warning Persistence limitation -The `MemoryStorageClient` does not persist data between runs. All data is lost when the program terminates. -::: - - -{MemoryStorageClientBasicExample} - - ### File system storage client The `FileSystemStorageClient` provides persistent storage by writing data directly to the file system. It uses intelligent caching and batch processing for better performance while storing data in human-readable JSON format. This is the default storage client used by Crawlee when no other storage client is specified, making it ideal for large datasets and long-running operations where data persistence is required. @@ -84,13 +72,13 @@ The `FileSystemStorageClient` is not safe for concurrent access from multiple cr This storage client is ideal for large datasets, and long-running operations where data persistence is required. Data can be easily inspected and shared with other tools. -{FileSystemStorageClientBasicExample} + {FileSystemStorageClientBasicExample} Configuration options for the `FileSystemStorageClient` can be set through environment variables or the `Configuration` class: -- **`storage_dir`** (env: `CRAWLEE_STORAGE_DIR`, default: `'./storage'`): The root directory for all storage data -- **`purge_on_start`** (env: `CRAWLEE_PURGE_ON_START`, default: `True`): Whether to purge default storages on start +- **`storage_dir`** (env: `CRAWLEE_STORAGE_DIR`, default: `'./storage'`) - The root directory for all storage data. 
+- **`purge_on_start`** (env: `CRAWLEE_PURGE_ON_START`, default: `True`) - Whether to purge default storages on start. Data is stored using the following directory structure: @@ -115,14 +103,26 @@ Data is stored using the following directory structure: ``` Where: -- `{CRAWLEE_STORAGE_DIR}`: The root directory for local storage -- `{DATASET_NAME}`, `{KVS_NAME}`, `{RQ_NAME}`: The unique names for each storage instance (defaults to `"default"`) -- Files are stored directly without additional metadata files for simpler structure +- `{CRAWLEE_STORAGE_DIR}` - The root directory for local storage. +- `{DATASET_NAME}`, `{KVS_NAME}`, `{RQ_NAME}` - The unique names for each storage instance (defaults to `"default"`). +- Files are stored directly without additional metadata files for simpler structure. Here is an example of how to configure the `FileSystemStorageClient`: -{FileSystemStorageClientConfigurationExample} + {FileSystemStorageClientConfigurationExample} + + +### Memory storage client + +The `MemoryStorageClient` stores all data in memory using Python data structures. It provides fast access but does not persist data between runs, meaning all data is lost when the program terminates. This storage client is primarily suitable for testing and development, and is usually not a good fit for production use. However, in some cases where speed is prioritized over persistence, it can make sense. + +:::warning Persistence limitation +The `MemoryStorageClient` does not persist data between runs. All data is lost when the program terminates. +::: + + + {MemoryStorageClientBasicExample} ## Creating a custom storage client @@ -132,7 +132,7 @@ A storage client consists of two parts: the storage client factory and individua Here is an example of a custom storage client that implements the `StorageClient` interface: -{CustomStorageClientExample} + {CustomStorageClientExample} Custom storage clients can implement any storage logic, such as connecting to a database, using a cloud storage service, or integrating with other systems. They must implement the required methods for creating, reading, updating, and deleting data in the respective storages. @@ -140,11 +140,11 @@ Custom storage clients can implement any storage logic, such as connecting to a ## Registering storage clients Storage clients can be registered in multiple ways: -- **Globally**: Using the `ServiceLocator` or passing directly to the crawler -- **Per storage**: When opening a specific storage instance like `Dataset`, `KeyValueStore`, or `RequestQueue` +- **Globally** - Using the `ServiceLocator` or passing directly to the crawler. +- **Per storage** - When opening a specific storage instance like `Dataset`, `KeyValueStore`, or `RequestQueue`. -{RegisteringStorageClientsExample} + {RegisteringStorageClientsExample} You can also register different storage clients for each storage instance, allowing you to use different backends for different storages. This is useful when you want to use a fast in-memory storage for `RequestQueue` while persisting scraping results in `Dataset` or `KeyValueStore`. diff --git a/docs/guides/storages.mdx b/docs/guides/storages.mdx index e9bf312c97..227b08af14 100644 --- a/docs/guides/storages.mdx +++ b/docs/guides/storages.mdx @@ -72,7 +72,7 @@ Storage --|> RequestQueue ## Request queue -The `RequestQueue` is the primary storage for URLs in Crawlee, especially useful for deep crawling. 
It supports dynamic addition and removal of URLs, making it ideal for recursive tasks where URLs are discovered and added during the crawling process (e.g., following links across multiple pages). Each Crawlee project has a **default request queue**, which can be used to store URLs during a specific run. The `RequestQueue` is highly useful for large-scale and complex crawls. +The `RequestQueue` is the primary storage for URLs in Crawlee, especially useful for deep crawling. It supports dynamic addition of URLs, making it ideal for recursive tasks where URLs are discovered and added during the crawling process (e.g., following links across multiple pages). Each Crawlee project has a **default request queue**, which can be used to store URLs during a specific run. The following code demonstrates the usage of the `RequestQueue`: From a7b4f07e688fbd02f0bfe86a1129111c71124892 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 17 Jul 2025 14:24:08 +0200 Subject: [PATCH 10/11] Fix examples --- .../adaptive_crawler_handlers.py | 29 +++++++------------ .../playwright_pre_navigation.py | 7 +---- 2 files changed, 11 insertions(+), 25 deletions(-) diff --git a/docs/guides/code_examples/request_router/adaptive_crawler_handlers.py b/docs/guides/code_examples/request_router/adaptive_crawler_handlers.py index a2a3892fcb..a25831753e 100644 --- a/docs/guides/code_examples/request_router/adaptive_crawler_handlers.py +++ b/docs/guides/code_examples/request_router/adaptive_crawler_handlers.py @@ -6,9 +6,6 @@ AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightPreNavCrawlingContext, ) -from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import ( - AdaptiveContextError, -) async def main() -> None: @@ -16,44 +13,38 @@ async def main() -> None: max_requests_per_crawl=10, # Limit the max requests per crawl. ) - # Common pre-navigation hook (runs for all requests) @crawler.pre_navigation_hook async def common_setup(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: - # This runs for both HTTP and browser requests + # Common pre-navigation hook - runs for both HTTP and browser requests. context.request.headers |= HttpHeaders( {'Accept': 'text/html,application/xhtml+xml'}, ) - # Playwright-specific pre-navigation hook (only when using browser) @crawler.pre_navigation_hook(playwright_only=True) async def browser_setup(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: - # This only runs when browser is used + # Playwright-specific pre-navigation hook - runs only when browser is used. 
await context.page.set_viewport_size({'width': 1280, 'height': 720}) if context.block_requests: await context.block_requests(extra_url_patterns=['*.css', '*.js']) @crawler.router.default_handler async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None: - try: - # Try browser-based extraction first - page = context.page - title = await page.title() - method = 'browser' - except AdaptiveContextError: - # Fallback to static parsing - title_tag = context.parsed_content.find('title') - title = title_tag.get_text() if title_tag else 'No title' - method = 'static' + # Extract title using the unified context interface + title_tag = context.parsed_content.find('title') + title = title_tag.get_text() if title_tag else None + + # Extract other data consistently across both modes + links = [a.get('href') for a in context.parsed_content.find_all('a', href=True)] await context.push_data( { 'url': context.request.url, 'title': title, - 'method': method, + 'links': links, } ) - await crawler.run(['https://warehouse-theme-metal.myshopify.com/']) + await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': diff --git a/docs/guides/code_examples/request_router/playwright_pre_navigation.py b/docs/guides/code_examples/request_router/playwright_pre_navigation.py index eecea7271b..aab49717ee 100644 --- a/docs/guides/code_examples/request_router/playwright_pre_navigation.py +++ b/docs/guides/code_examples/request_router/playwright_pre_navigation.py @@ -44,12 +44,7 @@ async def setup_page(context: PlaywrightPreNavCrawlingContext) -> None: @crawler.router.default_handler async def default_handler(context: PlaywrightCrawlingContext) -> None: - # Wait for page to load - await context.page.wait_for_load_state('networkidle') - - # Extract page title title = await context.page.title() - await context.push_data( { 'url': context.request.url, @@ -57,7 +52,7 @@ async def default_handler(context: PlaywrightCrawlingContext) -> None: } ) - await crawler.run(['https://warehouse-theme-metal.myshopify.com/']) + await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': From 846ba561c9cb2186c9a1caab3a8fbd6a3bb3ccdc Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 17 Jul 2025 14:49:17 +0200 Subject: [PATCH 11/11] . --- .../code_examples/request_router/adaptive_crawler_handlers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/guides/code_examples/request_router/adaptive_crawler_handlers.py b/docs/guides/code_examples/request_router/adaptive_crawler_handlers.py index a25831753e..4814730df6 100644 --- a/docs/guides/code_examples/request_router/adaptive_crawler_handlers.py +++ b/docs/guides/code_examples/request_router/adaptive_crawler_handlers.py @@ -29,11 +29,11 @@ async def browser_setup(context: AdaptivePlaywrightPreNavCrawlingContext) -> Non @crawler.router.default_handler async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None: - # Extract title using the unified context interface + # Extract title using the unified context interface. title_tag = context.parsed_content.find('title') title = title_tag.get_text() if title_tag else None - # Extract other data consistently across both modes + # Extract other data consistently across both modes. links = [a.get('href') for a in context.parsed_content.find_all('a', href=True)] await context.push_data( pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy