docs: Add architecture overview guide and more #1306

Merged
merged 13 commits on Jul 18, 2025
4 changes: 2 additions & 2 deletions docs/examples/playwright_crawler_adaptive.mdx
@@ -1,6 +1,6 @@
---
id: adaptive-playwright-crawler
title: AdaptivePlaywrightCrawler
title: Adaptive Playwright crawler
---

import ApiLink from '@site/src/components/ApiLink';
@@ -13,7 +13,7 @@ It uses a more limited crawling context interface so that it is able to switch t

A [pre-navigation hook](/python/docs/guides/adaptive-playwright-crawler#page-configuration-with-pre-navigation-hooks) can be used to perform actions before navigating to the URL. This hook provides further flexibility in controlling the environment and preparing for navigation. Hooks are executed both for pages crawled by the HTTP-based sub-crawler and for those crawled by the Playwright-based sub-crawler. Use `playwright_only=True` to mark hooks that should be executed only for the Playwright sub-crawler.

For more detailed description please see [AdaptivePlaywrightCrawler guide](/python/docs/guides/adaptive-playwright-crawler 'AdaptivePlaywrightCrawler guide')
For a more detailed description, please see the [Adaptive Playwright crawler guide](/python/docs/guides/adaptive-playwright-crawler).
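
As a minimal sketch of the hook registration pattern (reusing the constructor, decorator, and context types from the adaptive crawler example added later in this PR; the hook bodies are illustrative assumptions):

from crawlee.crawlers import (
    AdaptivePlaywrightCrawler,
    AdaptivePlaywrightPreNavCrawlingContext,
)

crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(max_requests_per_crawl=5)

# Executed for pages handled by both the HTTP-based and the Playwright-based sub-crawler.
@crawler.pre_navigation_hook
async def common_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
    context.request.headers['Accept'] = 'text/html,application/xhtml+xml'

# Executed only when the Playwright-based sub-crawler handles the page.
@crawler.pre_navigation_hook(playwright_only=True)
async def browser_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
    await context.page.set_viewport_size({'width': 1280, 'height': 720})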

<RunnableCodeBlock className="language-python" language="python">
{AdaptivePlaywrightCrawlerExample}
427 changes: 427 additions & 0 deletions docs/guides/architecture_overview.mdx

Large diffs are not rendered by default.

@@ -1,6 +1,6 @@
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import CurlImpersonateHttpClient


@@ -11,15 +11,15 @@ async def main() -> None:
impersonate='chrome131',
)

crawler = BeautifulSoupCrawler(
crawler = ParselCrawler(
http_client=http_client,
# Limit the crawl to a maximum number of requests. Remove or increase it to crawl all links.
max_requests_per_crawl=10,
)

# Define the default request handler, which will be called for every request.
@crawler.router.default_handler
async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
async def request_handler(context: ParselCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url} ...')

# Enqueue all links from the page.
@@ -28,7 +28,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
# Extract data from the page.
data = {
'url': context.request.url,
'title': context.soup.title.string if context.soup.title else None,
'title': context.selector.css('title::text').get(),
}

# Push the extracted data to the default dataset.
@@ -1,6 +1,6 @@
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import HttpxHttpClient


@@ -11,15 +11,15 @@ async def main() -> None:
follow_redirects=True,
)

crawler = BeautifulSoupCrawler(
crawler = ParselCrawler(
http_client=http_client,
# Limit the crawl to a maximum number of requests. Remove or increase it to crawl all links.
max_requests_per_crawl=10,
)

# Define the default request handler, which will be called for every request.
@crawler.router.default_handler
async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
async def request_handler(context: ParselCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url} ...')

# Enqueue all links from the page.
@@ -28,7 +28,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
# Extract data from the page.
data = {
'url': context.request.url,
'title': context.soup.title.string if context.soup.title else None,
'title': context.selector.css('title::text').get(),
}

# Push the extracted data to the default dataset.
43 changes: 43 additions & 0 deletions docs/guides/code_examples/http_clients/parsel_impit_example.py
@@ -0,0 +1,43 @@
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import ImpitHttpClient


async def main() -> None:
http_client = ImpitHttpClient(
# Optional additional keyword arguments for `impit.AsyncClient`.
http3=True,
browser='firefox',
verify=True,
)

crawler = ParselCrawler(
http_client=http_client,
# Limit the crawl to a maximum number of requests. Remove or increase it to crawl all links.
max_requests_per_crawl=10,
)

# Define the default request handler, which will be called for every request.
@crawler.router.default_handler
async def request_handler(context: ParselCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url} ...')

# Enqueue all links from the page.
await context.enqueue_links()

# Extract data from the page.
data = {
'url': context.request.url,
'title': context.selector.css('title::text').get(),
}

# Push the extracted data to the default dataset.
await context.push_data(data)

# Run the crawler with the initial list of URLs.
await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
asyncio.run(main())
35 changes: 35 additions & 0 deletions docs/guides/code_examples/http_crawlers/beautifulsoup_example.py
@@ -0,0 +1,35 @@
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
# Create a BeautifulSoupCrawler instance
crawler = BeautifulSoupCrawler(
# Limit the crawl to 10 requests
max_requests_per_crawl=10,
)

# Define the default request handler
@crawler.router.default_handler
async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url}')

# Extract data using BeautifulSoup
data = {
'url': context.request.url,
'title': context.soup.title.string if context.soup.title else None,
}

# Push extracted data to the dataset
await context.push_data(data)

# Enqueue links found on the page for further crawling
await context.enqueue_links()

# Run the crawler
await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
asyncio.run(main())
Empty file.
52 changes: 52 additions & 0 deletions docs/guides/code_examples/http_crawlers/http_example.py
@@ -0,0 +1,52 @@
import asyncio
import re

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
# Create an HttpCrawler instance - no automatic parsing
crawler = HttpCrawler(
# Limit the crawl to 10 requests
max_requests_per_crawl=10,
)

# Define the default request handler
@crawler.router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url}')

# Get the raw response content
response_body = await context.http_response.read()
response_text = response_body.decode('utf-8')

# Extract title manually using regex (since we don't have a parser)
title_match = re.search(
r'<title[^>]*>([^<]+)</title>', response_text, re.IGNORECASE
)
title = title_match.group(1).strip() if title_match else None

# Extract basic information
data = {
'url': context.request.url,
'title': title,
}

# Push extracted data to the dataset
await context.push_data(data)

# Simple link extraction for further crawling
href_pattern = r'href=["\']([^"\']+)["\']'
matches = re.findall(href_pattern, response_text, re.IGNORECASE)

# Enqueue first few links found (limit to avoid too many requests)
for href in matches[:3]:
if href.startswith('http') and 'crawlee.dev' in href:
await context.add_requests([href])

# Run the crawler
await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
asyncio.run(main())
35 changes: 35 additions & 0 deletions docs/guides/code_examples/http_crawlers/parsel_example.py
@@ -0,0 +1,35 @@
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
# Create a ParselCrawler instance
crawler = ParselCrawler(
# Limit the crawl to 10 requests
max_requests_per_crawl=10,
)

# Define the default request handler
@crawler.router.default_handler
async def request_handler(context: ParselCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url}')

# Extract data using Parsel's XPath and CSS selectors
data = {
'url': context.request.url,
'title': context.selector.xpath('//title/text()').get(),
}

# Push extracted data to the dataset
await context.push_data(data)

# Enqueue links found on the page for further crawling
await context.enqueue_links()

# Run the crawler
await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
asyncio.run(main())
File renamed without changes.
File renamed without changes.
@@ -0,0 +1,37 @@
import asyncio
import re

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import HttpxHttpClient
from crawlee.request_loaders import SitemapRequestLoader


async def main() -> None:
# Create an HTTP client for fetching sitemaps
async with HttpxHttpClient() as http_client:
# Create a sitemap request loader with URL filtering
sitemap_loader = SitemapRequestLoader(
sitemap_urls=['https://crawlee.dev/sitemap.xml'],
http_client=http_client,
# Include only URLs that contain 'docs'
include=[re.compile(r'.*docs.*')],
max_buffer_size=500, # Buffer up to 500 URLs in memory
)

# Convert the sitemap loader to a request manager using the to_tandem method.
# This combines it with the default request queue into a single tandem.
request_manager = await sitemap_loader.to_tandem()

# Create a crawler and pass the request manager to it.
crawler = ParselCrawler(request_manager=request_manager)

@crawler.router.default_handler
async def handler(context: ParselCrawlingContext) -> None:
# New links will be enqueued directly to the queue.
await context.enqueue_links()

await crawler.run()


if __name__ == '__main__':
asyncio.run(main())
@@ -0,0 +1,40 @@
import asyncio
import re

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import HttpxHttpClient
from crawlee.request_loaders import RequestManagerTandem, SitemapRequestLoader
from crawlee.storages import RequestQueue


async def main() -> None:
# Create an HTTP client for fetching sitemaps
async with HttpxHttpClient() as http_client:
# Create a sitemap request loader with URL filtering
sitemap_loader = SitemapRequestLoader(
sitemap_urls=['https://crawlee.dev/sitemap.xml'],
http_client=http_client,
# Include only URLs that contain 'docs'
include=[re.compile(r'.*docs.*')],
max_buffer_size=500, # Buffer up to 500 URLs in memory
)

# Open the default request queue.
request_queue = await RequestQueue.open()

# And combine them together to a single request manager.
request_manager = RequestManagerTandem(sitemap_loader, request_queue)

# Create a crawler and pass the request manager to it.
crawler = ParselCrawler(request_manager=request_manager)

@crawler.router.default_handler
async def handler(context: ParselCrawlingContext) -> None:
# New links will be enqueued directly to the queue.
await context.enqueue_links()

await crawler.run()


if __name__ == '__main__':
asyncio.run(main())
@@ -0,0 +1,57 @@
import asyncio

from crawlee.crawlers import (
AdaptivePlaywrightCrawler,
AdaptivePlaywrightCrawlingContext,
AdaptivePlaywrightPreNavCrawlingContext,
)
from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import (
AdaptiveContextError,
)


async def main() -> None:
crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
max_requests_per_crawl=5,
)

# Common pre-navigation hook (runs for all requests)
@crawler.pre_navigation_hook
async def common_setup(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
# This runs for both HTTP and browser requests
context.request.headers['Accept'] = 'text/html,application/xhtml+xml'

# Playwright-specific pre-navigation hook (only when using browser)
@crawler.pre_navigation_hook(playwright_only=True)
async def browser_setup(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
# This only runs when browser is used
await context.page.set_viewport_size({'width': 1280, 'height': 720})
if context.block_requests:
await context.block_requests(extra_url_patterns=['*.css', '*.js'])

@crawler.router.default_handler
async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
try:
# Try browser-based extraction first
page = context.page
title = await page.title()
method = 'browser'
except AdaptiveContextError:
# Fallback to static parsing
title_tag = context.parsed_content.find('title')
title = title_tag.get_text() if title_tag else 'No title'
method = 'static'

await context.push_data(
{
'url': context.request.url,
'title': title,
'method': method,
}
)

await crawler.run(['https://warehouse-theme-metal.myshopify.com/'])


if __name__ == '__main__':
asyncio.run(main())
Empty file.