Coverage for human_requests/session.py: 88%
157 statements
coverage.py v7.10.6, created at 2025-09-13 21:41 +0000
1"""
2core.session — unified stateful session for *curl_cffi* and *Playwright*-compatible engines.
4Main Methods
5============
6* ``Session.request`` — low-level HTTP request (curl_cffi) with cookie jar.
7* ``Session.goto_page`` — opens a URL in the browser, returns a Page inside
8 a context manager; upon exit synchronizes cookies + localStorage.
9* ``Response.render`` — offline render of a pre-fetched Response.
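
Quick-start sketch (illustrative only; the import path is assumed from the package
layout and ``https://example.com`` is a placeholder)::

    import asyncio
    from human_requests.session import Session

    async def main() -> None:
        async with Session(browser="chromium", headless=True) as session:
            # fast direct request via curl_cffi
            resp = await session.request("GET", "https://example.com")
            print(resp.status_code)

            # real browser navigation via Playwright
            async with session.goto_page("https://example.com") as page:
                print(await page.title())

    asyncio.run(main())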

Optional Dependencies
=====================
- playwright-stealth: enabled via ``playwright_stealth=True``.
  If the package is not installed while the flag is set, a RuntimeError with
  installation instructions is raised.
- camoufox: selected with ``browser='camoufox'``.
- patchright: selected with ``browser='patchright'``.
- Incompatibility: camoufox/patchright cannot be combined with playwright_stealth;
  doing so raises RuntimeError.
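
Engine selection sketch (illustrative; assumes the optional packages are installed)::

    # Camoufox build instead of stock Firefox; stealth injection must stay off.
    fox = Session(browser="camoufox", playwright_stealth=False)

    # Patchright build instead of stock Chromium.
    chrome = Session(browser="patchright", playwright_stealth=False)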

Additional
==========
- Browser launch arguments are assembled by ``_make_browser_launch_opts()`` from:
  - ``browser_launch_opts`` (arbitrary dict)
  - ``headless`` (always overrides the key of the same name)
  - ``proxy`` (URL string or dict), adapted for Playwright/Patchright/Camoufox
- The proxy is also applied to curl_cffi (unless a custom ``proxy`` is passed to ``.request()``).
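
Launch-option and proxy sketch (illustrative; the proxy address is a placeholder)::

    session = Session(
        headless=False,
        proxy="http://user:pass@127.0.0.1:8080",
        browser_launch_opts={"args": ["--disable-gpu"]},
    )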
29"""

from __future__ import annotations

from contextlib import asynccontextmanager
from time import perf_counter
from types import TracebackType
from typing import Any, AsyncGenerator, Literal, Mapping, Optional, cast
from urllib.parse import urlsplit

from curl_cffi import requests as cffi_requests
from playwright.async_api import BrowserContext, Page
from playwright.async_api import Request as PWRequest
from playwright.async_api import Route

from .abstraction.cookies import CookieManager
from .abstraction.http import URL, HttpMethod
from .abstraction.proxy_manager import ParsedProxy
from .abstraction.request import Request
from .abstraction.response import Response
from .browsers import BrowserMaster, Engine
from .impersonation import ImpersonationConfig
from .tools.helper_tools import (
    build_storage_state_for_context,
    handle_nav_with_retries,
    merge_storage_state_from_context,
)
from .tools.http_utils import (
    collect_set_cookie_headers,
    compose_cookie_header,
    guess_encoding,
    parse_set_cookie,
)

__all__ = ["Session"]

class Session:
    """curl_cffi.AsyncSession + BrowserMaster + CookieManager."""

    def __init__(
        self,
        *,
        timeout: float = 15.0,
        headless: bool = True,
        browser: Engine = "chromium",
        spoof: ImpersonationConfig | None = None,
        playwright_stealth: bool = True,
        page_retry: int = 2,
        direct_retry: int = 1,
        browser_launch_opts: Mapping[str, Any] = {},
        proxy: str | dict[str, str] | None = None,
    ) -> None:
        """
        Args:
            timeout: default timeout (in seconds) for both direct and goto requests
            headless: launch mode (passed into the browser launch arguments)
            browser: chromium/firefox/webkit are standard; camoufox/patchright are special builds
            spoof: impersonation configuration for direct requests
            playwright_stealth: hides certain browser automation signatures
            page_retry: number of "soft" retries for page navigation (after the initial attempt)
            direct_retry: number of retries for direct requests on curl_cffi Timeout (after the first attempt)
            browser_launch_opts: extra browser launch arguments (arbitrary dict)
            proxy: proxy URL string or playwright-style dict, applied to both engines
        """
        self.timeout: float = timeout
        """Timeout for goto/direct requests."""

        self.headless: bool = bool(headless)
        """Whether to run the browser in headless mode."""

        self.browser_name: Engine = browser
        """Current browser engine (chromium/firefox/webkit/camoufox/patchright)."""

        self.spoof: ImpersonationConfig = spoof or ImpersonationConfig()
        """Impersonation settings (user-agent, TLS, client hello)."""

        self.playwright_stealth: bool = bool(playwright_stealth)
        """Whether to hide certain automation signatures.
        Implemented via JS injection; some sites may detect it."""

        self.page_retry: int = int(page_retry)
        """Number of soft retries (page.reload()) after a navigation timeout."""

        self.direct_retry: int = int(direct_retry)
        """Number of retries for a direct request after a curl_cffi Timeout."""

        if self.browser_name in ("camoufox", "patchright") and self.playwright_stealth:
            raise RuntimeError(
                "playwright_stealth=True is incompatible with browser='camoufox'/'patchright'. "
                "Disable stealth or use chromium/firefox/webkit."
            )

        # Custom browser launch parameters + proxy
        self.browser_launch_opts: Mapping[str, Any] = browser_launch_opts
        """Browser launch arguments (arbitrary keys)."""

        self.proxy: str | dict[str, str] | None = proxy
        """
        Proxy server, one of:

        a. a URL string of the form ``scheme://user:pass@host:port``
        b. a playwright-style dict
        """

        # Cookie/localStorage state
        self.cookies: CookieManager = CookieManager([])
        """Storage of all active cookies."""

        self.local_storage: dict[str, dict[str, str]] = {}
        """localStorage from the last browser context (goto run)."""

        # Low-level HTTP
        self._curl: Optional[cffi_requests.AsyncSession] = None

        # Browser engine, managed by the master (always yields a Browser)
        self._bm: BrowserMaster = BrowserMaster(
            engine=self.browser_name,
            stealth=self.playwright_stealth,
            launch_opts=self._make_browser_launch_opts(),  # initial snapshot
        )

    # ──────────────── Launch args & proxy helpers ────────────────
    def _make_browser_launch_opts(self) -> dict[str, Any]:
        """
        Merges the launch arguments for BrowserMaster from the Session settings.

        Sources:
            - self.browser_launch_opts (arbitrary keys)
            - self.headless (overrides the key of the same name)
            - self.proxy (URL string or dict), converted to a Playwright-style proxy
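
        Example (illustrative; assumes ``ParsedProxy`` maps a bare URL to Playwright's
        ``{"server": ...}`` form)::

            Session(
                headless=False,
                proxy="http://127.0.0.1:8080",
                browser_launch_opts={"args": ["--disable-gpu"]},
            )._make_browser_launch_opts()
            # → {"args": ["--disable-gpu"], "headless": False,
            #    "proxy": {"server": "http://127.0.0.1:8080"}}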
        """
        opts = dict(self.browser_launch_opts)
        opts["headless"] = bool(self.headless)

        pw_proxy = ParsedProxy.from_any(self.proxy)
        if pw_proxy is not None:
            opts["proxy"] = pw_proxy.for_playwright()

        return opts

    # ────── HTTP via curl_cffi ──────
    async def request(
        self,
        method: HttpMethod | str,
        url: str,
        *,
        headers: Optional[Mapping[str, str]] = None,
        retry: int | None = None,
        **kwargs: Any,
    ) -> Response:
        """
        Standard fast request via curl_cffi.
        Provide either an HttpMethod or its string representation, plus a URL.

        Optionally, additional headers can be passed.

        Extra parameters are forwarded via **kwargs to curl_cffi.AsyncSession.request
        (see the curl_cffi documentation for details).
        Retries happen ONLY on cffi Timeout: ``curl_cffi.requests.exceptions.Timeout``.
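
        Example (illustrative; the endpoint is a placeholder)::

            resp = await session.request(
                "POST",
                "https://example.com/api",
                headers={"x-debug": "1"},
                json={"query": "ping"},  # forwarded as-is to curl_cffi
                retry=2,                 # overrides Session.direct_retry for this call
            )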
        """
        method_enum = method if isinstance(method, HttpMethod) else HttpMethod[str(method).upper()]
        base_headers = {k.lower(): v for k, v in (headers or {}).items()}

        # lazy curl session
        if self._curl is None:
            self._curl = cffi_requests.AsyncSession()

        curl = self._curl
        assert curl is not None  # for mypy: no longer a union below

        # spoof UA / headers
        imper_profile = self.spoof.choose(self.browser_name)
        base_headers.update(self.spoof.forge_headers(imper_profile))

        # Cookie header (fixed once, for the first attempt)
        url_parts = urlsplit(url)
        cookie_header, sent_cookies = compose_cookie_header(
            url_parts, base_headers, list(self.cookies)
        )
        if cookie_header:
            base_headers["cookie"] = cookie_header

        # default proxies come from Session.proxy unless the caller passed their own
        pp_user_proxies = ParsedProxy.from_any(kwargs.pop("proxy", None))
        user_proxies = None
        if pp_user_proxies:
            user_proxies = pp_user_proxies.for_curl()

        pp_default_proxies = ParsedProxy.from_any(self.proxy)
        default_proxies = None
        if pp_default_proxies:
            default_proxies = pp_default_proxies.for_curl()

        attempts_left = self.direct_retry if retry is None else int(retry)
        last_err: Exception | None = None

        async def _do_request() -> tuple[Any, float]:
            req_headers = dict(base_headers)  # a fresh copy per attempt
            t0 = perf_counter()
            r = await curl.request(
                method_enum.value,
                url,
                headers=req_headers,
                impersonate=cast(  # narrow the type to curl_cffi's Literal set
                    "cffi_requests.impersonate.BrowserTypeLiteral", imper_profile
                ),
                timeout=self.timeout,
                proxy=user_proxies if user_proxies is not None else default_proxies,
                **kwargs,
            )
            duration = perf_counter() - t0
            return r, duration

        # first attempt + soft retries on Timeout
        try:
            r, duration = await _do_request()
        except cffi_requests.exceptions.Timeout as e:
            last_err = e
            while attempts_left > 0:
                attempts_left -= 1
                try:
                    r, duration = await _do_request()
                    last_err = None
                    break
                except cffi_requests.exceptions.Timeout as e2:
                    last_err = e2
            if last_err is not None:
                raise last_err

        # response → cookies
        resp_headers = {k.lower(): v for k, v in r.headers.items()}
        raw_sc = collect_set_cookie_headers(r.headers)
        resp_cookies = parse_set_cookie(raw_sc, url_parts.hostname or "")
        self.cookies.add(resp_cookies)

        charset = guess_encoding(resp_headers)
        body_text = r.content.decode(charset, errors="replace")

        data = kwargs.get("data")
        json_body = kwargs.get("json")
        files = kwargs.get("files")

        # models
        req_model = Request(
            method=method_enum,
            url=URL(full_url=url),
            headers=dict(base_headers),
            body=data or json_body or files or None,
            cookies=sent_cookies,
        )
        resp_model = Response(
            request=req_model,
            url=URL(full_url=str(r.url)),
            headers=resp_headers,
            cookies=resp_cookies,
            body=body_text,
            status_code=r.status_code,
            duration=duration,
            _render_callable=self._render_response,
        )
        return resp_model

    # ────── browser nav ──────
    @asynccontextmanager
    async def goto_page(
        self,
        url: str,
        *,
        wait_until: Literal["commit", "load", "domcontentloaded", "networkidle"] = "commit",
        retry: int | None = None,
    ) -> AsyncGenerator[Page, None]:
        """
        Opens a page in the browser using a one-time context.
        Retries perform a "soft reload" without recreating the context.
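
        Example (illustrative; the URL is a placeholder)::

            async with session.goto_page("https://example.com", wait_until="load") as page:
                html = await page.content()
            # on exit, cookies and localStorage are merged back into the Session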
        """
        # Refresh the launch arguments in the master before starting
        self._bm.launch_opts = self._make_browser_launch_opts()
        await self._bm.start()

        storage_state = build_storage_state_for_context(
            local_storage=self.local_storage,
            cookie_manager=self.cookies,
        )
        ctx = await self._bm.new_context(storage_state=storage_state)
        page = await ctx.new_page()
        timeout_ms = int(self.timeout * 1000)
        attempts_left = self.page_retry if retry is None else int(retry)

        try:
            await handle_nav_with_retries(
                page,
                target_url=url,
                wait_until=wait_until,
                timeout_ms=timeout_ms,
                attempts=attempts_left,
                on_retry=None,
            )
            yield page
        finally:
            self.local_storage = await merge_storage_state_from_context(
                ctx, cookie_manager=self.cookies
            )
            await page.close()
            await ctx.close()

    # ────── Offline render ──────
    @asynccontextmanager
    async def _render_response(
        self,
        response: Response,
        *,
        wait_until: Literal["load", "domcontentloaded", "networkidle"] = "domcontentloaded",
        retry: int | None = None,
    ) -> AsyncGenerator[Page, None]:
        """
        Offline render of a Response: creates a temporary context (with our storage_state),
        intercepts the first request and responds with the prepared body.
        Retries do not recreate the context/page; instead a "soft reload" is performed,
        reattaching the route on retry.
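
        Sketch of the intended flow (illustrative; assumes ``Response.render`` simply
        forwards to this context manager via ``_render_callable``)::

            resp = await session.request("GET", "https://example.com")
            async with resp.render() as page:
                print(await page.title())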
        """
        # Refresh the launch arguments in the master before starting
        self._bm.launch_opts = self._make_browser_launch_opts()
        await self._bm.start()

        storage_state = build_storage_state_for_context(
            local_storage=self.local_storage,
            cookie_manager=self.cookies,
        )
        ctx: BrowserContext = await self._bm.new_context(storage_state=cast(Any, storage_state))
        timeout_ms = int(self.timeout * 1000)
        attempts_left = self.page_retry if retry is None else int(retry)

        async def _attach_route_once() -> None:
            await ctx.unroute("**/*")

            async def handler(route: Route, _req: PWRequest) -> None:
                await route.fulfill(
                    status=response.status_code,
                    headers=dict(response.headers),
                    body=response.body.encode("utf-8"),
                )

            await ctx.route("**/*", handler, times=1)

        await _attach_route_once()
        page = await ctx.new_page()

        try:

            async def _on_retry() -> None:
                await _attach_route_once()

            await handle_nav_with_retries(
                page,
                target_url=response.url.full_url,
                wait_until=wait_until,
                timeout_ms=timeout_ms,
                attempts=attempts_left,
                on_retry=_on_retry,
            )
            yield page
        finally:
            self.local_storage = await merge_storage_state_from_context(
                ctx, cookie_manager=self.cookies
            )
            await page.close()
            await ctx.close()

    # ────── cleanup ──────
    async def close(self) -> None:
        # close the browser engines
        await self._bm.close()
        # close the HTTP session
        if self._curl:
            await self._curl.close()
            self._curl = None

    # "async with" support
    async def __aenter__(self) -> "Session":
        return self

    async def __aexit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc: Optional[BaseException],
        tb: Optional[TracebackType],
    ) -> None:
        await self.close()