docs: Add architecture overview guide and more #1306
Merged

Commits (13)
- 2f180ce (vdusek): Docs: Add architecture overview guide and more
- dc4542e (vdusek): Fix links
- ac8aced (vdusek): better cover impit client
- 2b0cc7f (vdusek): adaptive pw crawler naming guide/example
- 06ee50f (vdusek): add missing http crawler
- 8e3f0f8 (vdusek): First feedback from Pepa
- 906b9b6 (vdusek): add max requests per crawl
- 2b7668a (vdusek): linter
- 13fdabf (vdusek): address Honza's feedback
- a7b4f07 (vdusek): Fix examples
- 846ba56 (vdusek): .
- 4e273f1 (vdusek): Merge branch 'master' into add-architecture-overview
- 9d9e7e3 (vdusek): Merge branch 'master' into add-architecture-overview
43 changes: 43 additions & 0 deletions
docs/guides/code_examples/http_clients/parsel_impit_example.py
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import ImpitHttpClient


async def main() -> None:
    http_client = ImpitHttpClient(
        # Optional additional keyword arguments for `impit.AsyncClient`.
        http3=True,
        browser='firefox',
        verify=True,
    )

    crawler = ParselCrawler(
        http_client=http_client,
        # Limit the crawl to 10 requests. Remove or increase it to crawl all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Enqueue all links from the page.
        await context.enqueue_links()

        # Extract data from the page.
        data = {
            'url': context.request.url,
            'title': context.selector.css('title::text').get(),
        }

        # Push the extracted data to the default dataset.
        await context.push_data(data)

    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
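Since the HTTP client is passed in as a plain dependency, the same Impit-backed client should be reusable with the other HTTP-based crawler classes, not just ParselCrawler. A minimal sketch of that idea, assuming BeautifulSoupCrawler accepts the same `http_client` argument as ParselCrawler does above:

import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.http_clients import ImpitHttpClient


async def main() -> None:
    # The same Impit-backed client, reused with a different crawler class
    # (assumption: BeautifulSoupCrawler takes the same http_client keyword).
    crawler = BeautifulSoupCrawler(
        http_client=ImpitHttpClient(browser='firefox'),
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())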
35 changes: 35 additions & 0 deletions
docs/guides/code_examples/http_crawlers/beautifulsoup_example.py
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    # Create a BeautifulSoupCrawler instance
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to 10 requests
        max_requests_per_crawl=10,
    )

    # Define the default request handler
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

        # Extract data using BeautifulSoup
        data = {
            'url': context.request.url,
            'title': context.soup.title.string if context.soup.title else None,
        }

        # Push extracted data to the dataset
        await context.push_data(data)

        # Enqueue links found on the page for further crawling
        await context.enqueue_links()

    # Run the crawler
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
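BeautifulSoup itself can sit on top of several underlying parsers. The sketch below assumes BeautifulSoupCrawler exposes a `parser` keyword for choosing the backend; the parameter name and accepted values are assumptions not shown in this PR, and lxml would need to be installed separately:

from crawlee.crawlers import BeautifulSoupCrawler

# Assumed `parser` keyword; the stdlib 'html.parser' would presumably
# also be a valid choice if lxml is not installed.
crawler = BeautifulSoupCrawler(
    parser='lxml',
    max_requests_per_crawl=10,
)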
Empty file.
52 changes: 52 additions & 0 deletions
import asyncio
import re

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    # Create an HttpCrawler instance - no automatic parsing
    crawler = HttpCrawler(
        # Limit the crawl to 10 requests
        max_requests_per_crawl=10,
    )

    # Define the default request handler
    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

        # Get the raw response content
        response_body = await context.http_response.read()
        response_text = response_body.decode('utf-8')

        # Extract title manually using regex (since we don't have a parser)
        title_match = re.search(
            r'<title[^>]*>([^<]+)</title>', response_text, re.IGNORECASE
        )
        title = title_match.group(1).strip() if title_match else None

        # Extract basic information
        data = {
            'url': context.request.url,
            'title': title,
        }

        # Push extracted data to the dataset
        await context.push_data(data)

        # Simple link extraction for further crawling
        href_pattern = r'href=["\']([^"\']+)["\']'
        matches = re.findall(href_pattern, response_text, re.IGNORECASE)

        # Enqueue first few links found (limit to avoid too many requests)
        for href in matches[:3]:
            if href.startswith('http') and 'crawlee.dev' in href:
                await context.add_requests([href])

    # Run the crawler
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
35 changes: 35 additions & 0 deletions
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    # Create a ParselCrawler instance
    crawler = ParselCrawler(
        # Limit the crawl to 10 requests
        max_requests_per_crawl=10,
    )

    # Define the default request handler
    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

        # Extract data using Parsel's XPath selectors
        data = {
            'url': context.request.url,
            'title': context.selector.xpath('//title/text()').get(),
        }

        # Push extracted data to the dataset
        await context.push_data(data)

        # Enqueue links found on the page for further crawling
        await context.enqueue_links()

    # Run the crawler
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
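For reference, Parsel exposes the same query through CSS selectors as well. This hypothetical handler shows both styles side by side, mirroring the `.css('title::text')` call used in the Impit example above:

from crawlee.crawlers import ParselCrawlingContext


async def request_handler(context: ParselCrawlingContext) -> None:
    # Two equivalent ways to pull the page title out of the same selector.
    title_xpath = context.selector.xpath('//title/text()').get()
    title_css = context.selector.css('title::text').get()
    context.log.info(f'XPath: {title_xpath}, CSS: {title_css}')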
File renamed without changes.
File renamed without changes.
37 changes: 37 additions & 0 deletions
docs/guides/code_examples/request_loaders/sitemap_tandem_example.py
import asyncio
import re

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import HttpxHttpClient
from crawlee.request_loaders import SitemapRequestLoader


async def main() -> None:
    # Create an HTTP client for fetching sitemaps
    async with HttpxHttpClient() as http_client:
        # Create a sitemap request loader with URL filtering
        sitemap_loader = SitemapRequestLoader(
            sitemap_urls=['https://crawlee.dev/sitemap.xml'],
            http_client=http_client,
            # Include only URLs that contain 'docs'
            include=[re.compile(r'.*docs.*')],
            max_buffer_size=500,  # Buffer up to 500 URLs in memory
        )

        # Convert the sitemap loader to a request manager using the to_tandem
        # method. This pairs it with the default request queue.
        request_manager = await sitemap_loader.to_tandem()

        # Create a crawler and pass the request manager to it.
        crawler = ParselCrawler(request_manager=request_manager)

        @crawler.router.default_handler
        async def handler(context: ParselCrawlingContext) -> None:
            # New links will be enqueued directly to the queue.
            await context.enqueue_links()

        await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())
40 changes: 40 additions & 0 deletions
docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py
import asyncio
import re

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import HttpxHttpClient
from crawlee.request_loaders import RequestManagerTandem, SitemapRequestLoader
from crawlee.storages import RequestQueue


async def main() -> None:
    # Create an HTTP client for fetching sitemaps
    async with HttpxHttpClient() as http_client:
        # Create a sitemap request loader with URL filtering
        sitemap_loader = SitemapRequestLoader(
            sitemap_urls=['https://crawlee.dev/sitemap.xml'],
            http_client=http_client,
            # Include only URLs that contain 'docs'
            include=[re.compile(r'.*docs.*')],
            max_buffer_size=500,  # Buffer up to 500 URLs in memory
        )

        # Open the default request queue.
        request_queue = await RequestQueue.open()

        # Combine the loader and the queue into a single request manager.
        request_manager = RequestManagerTandem(sitemap_loader, request_queue)

        # Create a crawler and pass the request manager to it.
        crawler = ParselCrawler(request_manager=request_manager)

        @crawler.router.default_handler
        async def handler(context: ParselCrawlingContext) -> None:
            # New links will be enqueued directly to the queue.
            await context.enqueue_links()

        await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())
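The two variants could also be mixed: if to_tandem accepts an optional request manager (an assumption; the signature is not shown in this PR), a named queue could be paired with the loader without constructing RequestManagerTandem by hand. A sketch reusing `sitemap_loader` from the example above:

from crawlee.storages import RequestQueue

# Assumption: to_tandem() takes an optional request manager to pair with.
# (This belongs inside the `async with` block of the example above.)
named_queue = await RequestQueue.open(name='sitemap-queue')
request_manager = await sitemap_loader.to_tandem(named_queue)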
57 changes: 57 additions & 0 deletions
docs/guides/code_examples/request_router/adaptive_crawler_handlers.py
import asyncio

from crawlee.crawlers import (
    AdaptivePlaywrightCrawler,
    AdaptivePlaywrightCrawlingContext,
    AdaptivePlaywrightPreNavCrawlingContext,
)
from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import (
    AdaptiveContextError,
)


async def main() -> None:
    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        max_requests_per_crawl=5,
    )

    # Common pre-navigation hook (runs for all requests)
    @crawler.pre_navigation_hook
    async def common_setup(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
        # This runs for both HTTP and browser requests
        context.request.headers['Accept'] = 'text/html,application/xhtml+xml'

    # Playwright-specific pre-navigation hook (only when using browser)
    @crawler.pre_navigation_hook(playwright_only=True)
    async def browser_setup(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
        # This only runs when browser is used
        await context.page.set_viewport_size({'width': 1280, 'height': 720})
        if context.block_requests:
            await context.block_requests(extra_url_patterns=['*.css', '*.js'])

    @crawler.router.default_handler
    async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        try:
            # Try browser-based extraction first
            page = context.page
            title = await page.title()
            method = 'browser'
        except AdaptiveContextError:
            # Fallback to static parsing
            title_tag = context.parsed_content.find('title')
            title = title_tag.get_text() if title_tag else 'No title'
            method = 'static'

        await context.push_data(
            {
                'url': context.request.url,
                'title': title,
                'method': method,
            }
        )

    await crawler.run(['https://warehouse-theme-metal.myshopify.com/'])


if __name__ == '__main__':
    asyncio.run(main())
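The adaptive crawler's static pass is not necessarily tied to BeautifulSoup; a Parsel-backed factory should work the same way. A minimal sketch assuming an analogous `with_parsel_static_parser` constructor (mirroring the BeautifulSoup one above):

import asyncio

from crawlee.crawlers import (
    AdaptivePlaywrightCrawler,
    AdaptivePlaywrightCrawlingContext,
)


async def main() -> None:
    # Same adaptive behavior, but the static pass is assumed to go through
    # Parsel instead of BeautifulSoup.
    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(
        max_requests_per_crawl=5,
    )

    @crawler.router.default_handler
    async def handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

    await crawler.run(['https://warehouse-theme-metal.myshopify.com/'])


if __name__ == '__main__':
    asyncio.run(main())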
Empty file.