docs: Add architecture overview guide and more #1306
Merged

Commits (13)
- 2f180ce (vdusek): Docs: Add architecture overview guide and more
- dc4542e (vdusek): Fix links
- ac8aced (vdusek): better cover impit client
- 2b0cc7f (vdusek): adaptive pw crawler naming guide/example
- 06ee50f (vdusek): add missing http crawler
- 8e3f0f8 (vdusek): First feedback from Pepa
- 906b9b6 (vdusek): add max requests per crawl
- 2b7668a (vdusek): linter
- 13fdabf (vdusek): address Honza's feedback
- a7b4f07 (vdusek): Fix examples
- 846ba56 (vdusek): .
- 4e273f1 (vdusek): Merge branch 'master' into add-architecture-overview
- 9d9e7e3 (vdusek): Merge branch 'master' into add-architecture-overview
43 changes: 43 additions & 0 deletions
docs/guides/code_examples/http_clients/parsel_impit_example.py
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import ImpitHttpClient


async def main() -> None:
    http_client = ImpitHttpClient(
        # Optional additional keyword arguments for `impit.AsyncClient`.
        http3=True,
        browser='firefox',
        verify=True,
    )

    crawler = ParselCrawler(
        http_client=http_client,
        # Limit the crawl to 10 requests. Remove or increase it to crawl all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Enqueue all links from the page.
        await context.enqueue_links()

        # Extract data from the page.
        data = {
            'url': context.request.url,
            'title': context.selector.css('title::text').get(),
        }

        # Push the extracted data to the default dataset.
        await context.push_data(data)

    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
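Since the HTTP client is passed in as a plain dependency, the same Impit-backed client should be reusable with the other HTTP-based crawler classes, not just ParselCrawler. A minimal sketch of that idea, assuming BeautifulSoupCrawler accepts the same `http_client` argument as ParselCrawler does above:

import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.http_clients import ImpitHttpClient


async def main() -> None:
    # The same Impit-backed client, reused with a different crawler class
    # (assumption: BeautifulSoupCrawler takes the same http_client keyword).
    crawler = BeautifulSoupCrawler(
        http_client=ImpitHttpClient(browser='firefox'),
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())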
35 changes: 35 additions & 0 deletions
docs/guides/code_examples/http_crawlers/beautifulsoup_example.py
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    # Create a BeautifulSoupCrawler instance
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to 10 requests
        max_requests_per_crawl=10,
    )

    # Define the default request handler
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

        # Extract data using BeautifulSoup
        data = {
            'url': context.request.url,
            'title': context.soup.title.string if context.soup.title else None,
        }

        # Push extracted data to the dataset
        await context.push_data(data)

        # Enqueue links found on the page for further crawling
        await context.enqueue_links()

    # Run the crawler
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
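BeautifulSoup itself can sit on top of several underlying parsers. The sketch below assumes BeautifulSoupCrawler exposes a `parser` keyword for choosing the backend; the parameter name and accepted values are assumptions not shown in this PR, and lxml would need to be installed separately:

from crawlee.crawlers import BeautifulSoupCrawler

# Assumed `parser` keyword; the stdlib 'html.parser' would presumably
# also be a valid choice if lxml is not installed.
crawler = BeautifulSoupCrawler(
    parser='lxml',
    max_requests_per_crawl=10,
)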
Empty file.
52 changes: 52 additions & 0 deletions
import asyncio
import re

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    # Create an HttpCrawler instance - no automatic parsing
    crawler = HttpCrawler(
        # Limit the crawl to 10 requests
        max_requests_per_crawl=10,
    )

    # Define the default request handler
    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

        # Get the raw response content
        response_body = await context.http_response.read()
        response_text = response_body.decode('utf-8')

        # Extract title manually using regex (since we don't have a parser)
        title_match = re.search(
            r'<title[^>]*>([^<]+)</title>', response_text, re.IGNORECASE
        )
        title = title_match.group(1).strip() if title_match else None

        # Extract basic information
        data = {
            'url': context.request.url,
            'title': title,
        }

        # Push extracted data to the dataset
        await context.push_data(data)

        # Simple link extraction for further crawling
        href_pattern = r'href=["\']([^"\']+)["\']'
        matches = re.findall(href_pattern, response_text, re.IGNORECASE)

        # Enqueue first few links found (limit to avoid too many requests)
        for href in matches[:3]:
            if href.startswith('http') and 'crawlee.dev' in href:
                await context.add_requests([href])

    # Run the crawler
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
35 changes: 35 additions & 0 deletions
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    # Create a ParselCrawler instance
    crawler = ParselCrawler(
        # Limit the crawl to 10 requests
        max_requests_per_crawl=10,
    )

    # Define the default request handler
    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

        # Extract data using Parsel's XPath selectors
        data = {
            'url': context.request.url,
            'title': context.selector.xpath('//title/text()').get(),
        }

        # Push extracted data to the dataset
        await context.push_data(data)

        # Enqueue links found on the page for further crawling
        await context.enqueue_links()

    # Run the crawler
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
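For reference, Parsel exposes the same query through CSS selectors as well. This hypothetical handler shows both styles side by side, mirroring the `.css('title::text')` call used in the Impit example above:

from crawlee.crawlers import ParselCrawlingContext


async def request_handler(context: ParselCrawlingContext) -> None:
    # Two equivalent ways to pull the page title out of the same selector.
    title_xpath = context.selector.xpath('//title/text()').get()
    title_css = context.selector.css('title::text').get()
    context.log.info(f'XPath: {title_xpath}, CSS: {title_css}')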
File renamed without changes.
File renamed without changes.
37 changes: 37 additions & 0 deletions
docs/guides/code_examples/request_loaders/sitemap_tandem_example.py
import asyncio
import re

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import HttpxHttpClient
from crawlee.request_loaders import SitemapRequestLoader


async def main() -> None:
    # Create an HTTP client for fetching sitemaps
    async with HttpxHttpClient() as http_client:
        # Create a sitemap request loader with URL filtering
        sitemap_loader = SitemapRequestLoader(
            sitemap_urls=['https://crawlee.dev/sitemap.xml'],
            http_client=http_client,
            # Include only URLs that contain 'docs'
            include=[re.compile(r'.*docs.*')],
            max_buffer_size=500,  # Buffer up to 500 URLs in memory
        )

        # Convert the sitemap loader to a request manager using the to_tandem
        # method. This pairs it with the default request queue.
        request_manager = await sitemap_loader.to_tandem()

        # Create a crawler and pass the request manager to it.
        crawler = ParselCrawler(request_manager=request_manager)

        @crawler.router.default_handler
        async def handler(context: ParselCrawlingContext) -> None:
            # New links will be enqueued directly to the queue.
            await context.enqueue_links()

        await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())
40 changes: 40 additions & 0 deletions
docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py
import asyncio
import re

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import HttpxHttpClient
from crawlee.request_loaders import RequestManagerTandem, SitemapRequestLoader
from crawlee.storages import RequestQueue


async def main() -> None:
    # Create an HTTP client for fetching sitemaps
    async with HttpxHttpClient() as http_client:
        # Create a sitemap request loader with URL filtering
        sitemap_loader = SitemapRequestLoader(
            sitemap_urls=['https://crawlee.dev/sitemap.xml'],
            http_client=http_client,
            # Include only URLs that contain 'docs'
            include=[re.compile(r'.*docs.*')],
            max_buffer_size=500,  # Buffer up to 500 URLs in memory
        )

        # Open the default request queue.
        request_queue = await RequestQueue.open()

        # Combine the loader and the queue into a single request manager.
        request_manager = RequestManagerTandem(sitemap_loader, request_queue)

        # Create a crawler and pass the request manager to it.
        crawler = ParselCrawler(request_manager=request_manager)

        @crawler.router.default_handler
        async def handler(context: ParselCrawlingContext) -> None:
            # New links will be enqueued directly to the queue.
            await context.enqueue_links()

        await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())
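The two variants could also be mixed: if to_tandem accepts an optional request manager (an assumption; the signature is not shown in this PR), a named queue could be paired with the loader without constructing RequestManagerTandem by hand. A sketch reusing `sitemap_loader` from the example above:

from crawlee.storages import RequestQueue

# Assumption: to_tandem() takes an optional request manager to pair with.
# (This belongs inside the `async with` block of the example above.)
named_queue = await RequestQueue.open(name='sitemap-queue')
request_manager = await sitemap_loader.to_tandem(named_queue)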
57 changes: 57 additions & 0 deletions
docs/guides/code_examples/request_router/adaptive_crawler_handlers.py
import asyncio

from crawlee.crawlers import (
    AdaptivePlaywrightCrawler,
    AdaptivePlaywrightCrawlingContext,
    AdaptivePlaywrightPreNavCrawlingContext,
)
from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import (
    AdaptiveContextError,
)


async def main() -> None:
    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        max_requests_per_crawl=5,
    )

    # Common pre-navigation hook (runs for all requests)
    @crawler.pre_navigation_hook
    async def common_setup(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
        # This runs for both HTTP and browser requests
        context.request.headers['Accept'] = 'text/html,application/xhtml+xml'

    # Playwright-specific pre-navigation hook (only when using browser)
    @crawler.pre_navigation_hook(playwright_only=True)
    async def browser_setup(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
        # This only runs when browser is used
        await context.page.set_viewport_size({'width': 1280, 'height': 720})
        if context.block_requests:
            await context.block_requests(extra_url_patterns=['*.css', '*.js'])

    @crawler.router.default_handler
    async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        try:
            # Try browser-based extraction first
            page = context.page
            title = await page.title()
            method = 'browser'
        except AdaptiveContextError:
            # Fallback to static parsing
            title_tag = context.parsed_content.find('title')
            title = title_tag.get_text() if title_tag else 'No title'
            method = 'static'

        await context.push_data(
            {
                'url': context.request.url,
                'title': title,
                'method': method,
            }
        )

    await crawler.run(['https://warehouse-theme-metal.myshopify.com/'])


if __name__ == '__main__':
    asyncio.run(main())
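The adaptive crawler's static pass is not necessarily tied to BeautifulSoup; a Parsel-backed factory should work the same way. A minimal sketch assuming an analogous `with_parsel_static_parser` constructor (mirroring the BeautifulSoup one above):

import asyncio

from crawlee.crawlers import (
    AdaptivePlaywrightCrawler,
    AdaptivePlaywrightCrawlingContext,
)


async def main() -> None:
    # Same adaptive behavior, but the static pass is assumed to go through
    # Parsel instead of BeautifulSoup.
    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(
        max_requests_per_crawl=5,
    )

    @crawler.router.default_handler
    async def handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

    await crawler.run(['https://warehouse-theme-metal.myshopify.com/'])


if __name__ == '__main__':
    asyncio.run(main())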
Empty file.