Content-Length: 445364 | pFad | http://github.com/apify/crawlee/pull/2792/commits/e62befcbad3ef2227665cd0502f975d72259cc70

feat: stopping the crawlers gracefully with `BasicCrawler.stop()` by barjin · Pull Request #2792 · apify/crawlee · GitHub
Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: stopping the crawlers gracefully with BasicCrawler.stop() #2792

Merged
merged 12 commits into from
Jan 20, 2025
Prev Previous commit
Next Next commit
chore: add e2e test
  • Loading branch information
barjin committed Jan 17, 2025
commit e62befcbad3ef2227665cd0502f975d72259cc70
7 changes: 7 additions & 0 deletions test/e2e/cheerio-stop/actor/.actor/actor.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"actorSpecification": 1,
"name": "test-cheerio-stop",
"version": "0.0",
"buildTag": "latest",
"env": null
}
8 changes: 8 additions & 0 deletions test/e2e/cheerio-stop/actor/.eslintrc.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"root": true,
"extends": "../../.eslintrc.json",
"parserOptions": {
"project": "./test/e2e/cheerio-stop/actor/tsconfig.json",
"ecmaVersion": 2022
}
}
11 changes: 11 additions & 0 deletions test/e2e/cheerio-stop/actor/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# IDE and OS metadata
.idea
.DS_Store
# Installed dependencies and the npm lockfile
node_modules
package-lock.json
# Presumably local storage directories written while the actor runs — TODO confirm
apify_storage
crawlee_storage
storage
# Artifacts the TypeScript compiler can generate from main.ts (only main.ts is committed)
main.d.ts
main.d.ts.map
main.js
main.js.map
28 changes: 28 additions & 0 deletions test/e2e/cheerio-stop/actor/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Multi-stage build: the builder stage needs dev dependencies to compile the TypeScript source.
FROM apify/actor-node:20-beta AS builder

# Copy all files, install all dependencies (including dev deps) and build the project.
COPY . ./
RUN npm install --include=dev \
    && npm run build

# Final image: contains only the build output and the production dependencies.
FROM apify/actor-node:20-beta
# Copy only the files needed at runtime from the builder stage.
COPY --from=builder /usr/src/app/packages ./packages
COPY --from=builder /usr/src/app/package.json ./
COPY --from=builder /usr/src/app/main.js ./

# Install only prod deps.
# `--omit=dev` / `--omit=optional` are the current spellings of the deprecated
# `--only=prod` / `--no-optional` flags and select the same set of packages.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional --no-audit \
    && npm update --no-audit \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --omit=optional --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version

# Run the compiled code.
# Exec (JSON array) form starts npm directly instead of wrapping it in `/bin/sh -c`,
# so stop signals sent to the container are delivered to npm itself.
CMD ["npm", "run", "start:prod"]
30 changes: 30 additions & 0 deletions test/e2e/cheerio-stop/actor/main.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import { CheerioCrawler, Dataset } from '@crawlee/cheerio';
import { Actor } from 'apify';

// Boot the Actor. When STORAGE_IMPLEMENTATION is 'LOCAL', the storage client is the
// `ApifyStorageLocal` class from `@apify/storage-local`, imported on demand.
const useLocalStorage = process.env.STORAGE_IMPLEMENTATION === 'LOCAL';
if (!useLocalStorage) {
    await Actor.init();
} else {
    // @ts-ignore
    await Actor.init({ storage: new (await import('@apify/storage-local')).ApifyStorageLocal() });
}

const START_URL = 'https://crawlee.dev/docs/quick-start';

// Counts default-handler invocations that reached the stop check; reset between runs.
let requestCount = 0;

const crawler = new CheerioCrawler();

crawler.router.addDefaultHandler(async ({ $, enqueueLinks, request: { url }, log }) => {
    await enqueueLinks({ globs: ['https://crawlee.dev/docs/**'] });

    const pageTitle = $('title').first().text();
    log.info(`URL: ${url} TITLE: ${pageTitle}`);
    await Dataset.pushData({ url, pageTitle });

    // Post-increment semantics: compare the value from before this invocation,
    // so the crawler is asked to stop once more than 10 earlier invocations got here.
    const handledBefore = requestCount;
    requestCount += 1;
    if (handledBefore > 10) {
        crawler.stop();
    }
});

// Run the same crawler twice from the same start URL, resetting the counter in between.
await crawler.run([START_URL]);
requestCount = 0;
await crawler.run([START_URL]);

await Actor.exit({ exit: Actor.isAtHome() });
35 changes: 35 additions & 0 deletions test/e2e/cheerio-stop/actor/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"name": "test-cheerio-stop",
"version": "0.0.1",
"description": "Crawler Stop Test - TypeScript",
"dependencies": {
"apify": "next",
"@apify/storage-local": "^2.1.3",
"@crawlee/basic": "file:./packages/basic-crawler",
"@crawlee/browser-pool": "file:./packages/browser-pool",
"@crawlee/http": "file:./packages/http-crawler",
"@crawlee/cheerio": "file:./packages/cheerio-crawler",
"@crawlee/core": "file:./packages/core",
"@crawlee/memory-storage": "file:./packages/memory-storage",
"@crawlee/types": "file:./packages/types",
"@crawlee/utils": "file:./packages/utils"
},
"overrides": {
"apify": {
"@crawlee/core": "file:./packages/core",
"@crawlee/types": "file:./packages/types",
"@crawlee/utils": "file:./packages/utils"
}
},
"devDependencies": {
"@apify/tsconfig": "^0.1.0",
"typescript": "^5.0.0"
},
"scripts": {
"start": "tsc && node main.js",
"start:prod": "node main.js",
"build": "tsc"
},
"type": "module",
"license": "ISC"
}
9 changes: 9 additions & 0 deletions test/e2e/cheerio-stop/actor/tsconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"extends": "@apify/tsconfig",
"compilerOptions": {
"module": "ES2022",
"target": "ES2022",
"lib": ["DOM"]
},
"include": ["./**/*.ts"]
}
12 changes: 12 additions & 0 deletions test/e2e/cheerio-stop/test.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import { initialize, getActorTestDir, runActor, expect } from '../tools.mjs';

// Resolve the actor directory that belongs to this test file and initialize it.
const actorDir = getActorTestDir(import.meta.url);
await initialize(actorDir);

// Run the actor and pick `stats` and `datasetItems` out of its result.
const runOutput = await runActor(actorDir);
const { stats, datasetItems } = runOutput;

const finishedFewerThan20 = stats.requestsFinished < 20;
await expect(finishedFewerThan20, 'crawler.stop() works');

// Deduplicate the URLs stored in the dataset.
const storedUrls = datasetItems.map(({ url }) => url);
const visitedUrls = new Set(storedUrls);

const visitedFewerThan20 = visitedUrls.size < 20;
await expect(visitedFewerThan20, 'crawler.stop() is by default stateless');








ApplySandwichStrip

pFad - (p)hone/(F)rame/(a)nonymizer/(d)eclutterfier!      Saves Data!


--- a PPN by Garber Painting Akron. With Image Size Reduction included!

Fetched URL: http://github.com/apify/crawlee/pull/2792/commits/e62befcbad3ef2227665cd0502f975d72259cc70

Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy