
Commit 5c89e4d

feat: allow custom cookies (#77)
* feat: working new advanced job options
* feat: add tests for adding custom cookies/headers
1 parent ed0828a commit 5c89e4d

17 files changed, +699 −40 lines changed

api/backend/job/models/job_options.py

Lines changed: 1 addition & 0 deletions
@@ -13,3 +13,4 @@ class JobOptions(BaseModel):
     proxies: list[str] = []
     site_map: Optional[SiteMap] = None
     collect_media: bool = False
+    custom_cookies: list[dict[str, Any]] = []
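
With this field in place, custom_cookies rides along as plain dicts on the job options payload. A minimal sketch of a submitted options dict (the values here are illustrative, not from the commit); only "name" and "value" are consumed downstream, since add_custom_cookies derives the cookie's domain and path from the job URL:

job_options = {
    "proxies": [],
    "site_map": None,
    "collect_media": False,
    # Each entry needs only "name" and "value"; missing keys fall back
    # to "default_name" / "default_value" in add_custom_cookies.
    "custom_cookies": [
        {"name": "session_id", "value": "abc123"},
        {"name": "consent", "value": "true"},
    ],
}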
api/backend/job/scraping/add_custom.py

Lines changed: 48 additions & 0 deletions

@@ -0,0 +1,48 @@
+from typing import Any, Optional
+from urllib.parse import urlparse
+
+from playwright.async_api import Page, BrowserContext
+
+import logging
+
+LOG = logging.getLogger(__name__)
+
+
+async def add_custom_cookies(
+    custom_cookies: list[dict[str, Any]],
+    url: str,
+    context: BrowserContext,
+) -> None:
+    parsed_url = urlparse(url)
+    domain = parsed_url.netloc
+
+    for cookie in custom_cookies:
+        cookie_dict = {
+            "name": cookie.get("name", "default_name"),
+            "value": cookie.get("value", "default_value"),
+            "domain": domain,
+            "path": "/",
+        }
+
+        LOG.info(f"Adding cookie: {cookie_dict}")
+        await context.add_cookies([cookie_dict])  # type: ignore
+
+
+async def add_custom_headers(
+    custom_headers: dict[str, Any],
+    page: Page,
+) -> None:
+    await page.set_extra_http_headers(custom_headers)
+
+
+async def add_custom_items(
+    url: str,
+    page: Page,
+    cookies: Optional[list[dict[str, Any]]] = None,
+    headers: Optional[dict[str, Any]] = None,
+) -> None:
+    if cookies:
+        await add_custom_cookies(cookies, url, page.context)
+
+    if headers:
+        await add_custom_headers(headers, page)
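
Below the diff, a minimal standalone sketch (not part of the commit) of how add_custom_items is driven; example.com and the cookie/header values are illustrative. The ordering detail that matters, and that scraping.py follows below, is calling it after the page exists but before the first page.goto, so the very first request already carries the cookies and headers:

import asyncio

from playwright.async_api import async_playwright

from api.backend.job.scraping.add_custom import add_custom_items


async def main() -> None:
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        # Apply cookies/headers first; add_custom_cookies pins the cookie
        # domain to the URL's netloc and the path to "/".
        await add_custom_items(
            url="http://example.com",
            page=page,
            cookies=[{"name": "session_id", "value": "abc123"}],
            headers={"User-Agent": "my-agent"},
        )

        await page.goto("http://example.com")
        await browser.close()


asyncio.run(main())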

api/backend/scraping.py

Lines changed: 9 additions & 2 deletions
@@ -12,6 +12,8 @@
 from api.backend.job.scraping.scraping_utils import scrape_content
 from api.backend.job.site_mapping.site_mapping import handle_site_mapping

+from api.backend.job.scraping.add_custom import add_custom_items
+
 LOG = logging.getLogger(__name__)


@@ -44,20 +46,22 @@ async def make_site_request(
     proxies: Optional[list[str]] = None,
     site_map: Optional[dict[str, Any]] = None,
     collect_media: bool = False,
+    custom_cookies: Optional[list[dict[str, Any]]] = None,
 ):
     if url in visited_urls:
         return

     proxy = None
+
     if proxies:
         proxy = random.choice(proxies)
         LOG.info(f"Using proxy: {proxy}")

     async with AsyncCamoufox(headless=True, proxy=proxy) as browser:
         page: Page = await browser.new_page()

-        if headers:
-            await page.set_extra_http_headers(headers)
+        # Add cookies and headers
+        await add_custom_items(url, page, custom_cookies, headers)

         LOG.info(f"Visiting URL: {url}")

@@ -113,6 +117,7 @@ async def make_site_request(
                 proxies=proxies,
                 site_map=site_map,
                 collect_media=collect_media,
+                custom_cookies=custom_cookies,
             )


@@ -152,6 +157,7 @@ async def scrape(
     proxies: Optional[list[str]] = None,
     site_map: Optional[dict[str, Any]] = None,
     collect_media: bool = False,
+    custom_cookies: Optional[list[dict[str, Any]]] = None,
 ):
     visited_urls: set[str] = set()
     pages: set[tuple[str, str]] = set()

@@ -166,6 +172,7 @@ async def scrape(
         proxies=proxies,
         site_map=site_map,
         collect_media=collect_media,
+        custom_cookies=custom_cookies,
     )

     elements: list[dict[str, dict[str, list[CapturedElement]]]] = []
Lines changed: 37 additions & 9 deletions
@@ -1,25 +1,53 @@
 import pytest
 import logging
-from playwright.async_api import async_playwright, Error
+from typing import Dict
+from playwright.async_api import async_playwright, Cookie, Route
+from api.backend.job.scraping.add_custom import add_custom_items

 logging.basicConfig(level=logging.DEBUG)
 LOG = logging.getLogger(__name__)


 @pytest.mark.asyncio
-async def test_proxy():
-    proxy = "127.0.0.1:8080"
+async def test_add_custom_items():
+    test_cookies = [{"name": "big", "value": "cookie"}]
+    test_headers = {"User-Agent": "test-agent", "Accept": "application/json"}

     async with async_playwright() as p:
-        browser = await p.firefox.launch(
-            headless=True, proxy={"server": f"http://{proxy}"}
-        )
+        browser = await p.chromium.launch(headless=True)
         context = await browser.new_context()
         page = await context.new_page()

-        with pytest.raises(Error) as excinfo:
-            await page.goto("http://example.com")
+        # Set up request interception
+        captured_headers: Dict[str, str] = {}
+
+        async def handle_route(route: Route) -> None:
+            nonlocal captured_headers
+            captured_headers = route.request.headers
+            await route.continue_()
+
+        await page.route("**/*", handle_route)
+
+        await add_custom_items(
+            url="http://example.com",
+            page=page,
+            cookies=test_cookies,
+            headers=test_headers,
+        )
+
+        # Navigate to example.com
+        await page.goto("http://example.com")
+
+        # Verify cookies were added
+        cookies: list[Cookie] = await page.context.cookies()
+        test_cookie = next((c for c in cookies if c.get("name") == "big"), None)
+
+        assert test_cookie is not None
+        assert test_cookie.get("value") == "cookie"
+        assert test_cookie.get("path") == "/"  # Default path should be set
+        assert test_cookie.get("sameSite") == "Lax"  # Default sameSite should be set

-        assert "NS_ERROR_PROXY_CONNECTION_REFUSED" in str(excinfo.value)
+        # Verify headers were added
+        assert captured_headers.get("user-agent") == "test-agent"

         await browser.close()
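
Two Playwright behaviors the assertions above lean on (noted here for the reader; not part of the diff): route.request.headers reports header names lower-cased, which is why the check reads captured_headers.get("user-agent"), and Chromium normalizes a cookie added without an explicit sameSite to "Lax", which the final cookie assertion depends on.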

api/backend/worker/job_worker.py

Lines changed: 13 additions & 1 deletion
@@ -1,4 +1,5 @@
 import os
+import json

 from api.backend.job import get_queued_job, update_job
 from api.backend.scraping import scrape

@@ -34,14 +35,25 @@ async def process_job():
         LOG.info(f"Beginning processing job: {job}.")
         try:
             _ = await update_job([job["id"]], field="status", value="Scraping")
+
+            proxies = job["job_options"]["proxies"]
+
+            if proxies and isinstance(proxies[0], str) and proxies[0].startswith("{"):
+                try:
+                    proxies = [json.loads(p) for p in proxies]
+                except json.JSONDecodeError:
+                    LOG.error(f"Failed to parse proxy JSON: {proxies}")
+                    proxies = []
+
             scraped = await scrape(
                 job["url"],
                 [Element(**j) for j in job["elements"]],
                 job["job_options"]["custom_headers"],
                 job["job_options"]["multi_page_scrape"],
-                job["job_options"]["proxies"],
+                proxies,
                 job["job_options"]["site_map"],
                 job["job_options"]["collect_media"],
+                job["job_options"]["custom_cookies"],
             )
             LOG.info(
                 f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}"
Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
+import { Box, Link, Typography } from "@mui/material";
+import { SetStateAction, Dispatch, useState } from "react";
+import { AdvancedJobOptionsDialog } from "./dialog/advanced-job-options-dialog";
+import { RawJobOptions } from "@/types";
+
+export type AdvancedJobOptionsProps = {
+  jobOptions: RawJobOptions;
+  setJobOptions: Dispatch<SetStateAction<RawJobOptions>>;
+};
+
+export const AdvancedJobOptions = ({
+  jobOptions,
+  setJobOptions,
+}: AdvancedJobOptionsProps) => {
+  const [open, setOpen] = useState(false);
+  return (
+    <Box sx={{ mb: 2 }}>
+      <Link
+        component="button"
+        variant="body2"
+        onClick={() => setOpen(true)}
+        sx={{
+          textDecoration: "none",
+          color: "primary.main",
+          "&:hover": {
+            color: "primary.dark",
+            textDecoration: "underline",
+          },
+          paddingLeft: 1,
+          display: "inline-flex",
+          alignItems: "center",
+          gap: 0.5,
+        }}
+      >
+        <Typography variant="body2">Advanced Job Options</Typography>
+      </Link>
+      <AdvancedJobOptionsDialog
+        open={open}
+        onClose={() => setOpen(false)}
+        jobOptions={jobOptions}
+        setJobOptions={setJobOptions}
+      />
+    </Box>
+  );
+};
