Coverage for human_requests / human_page.py: 71%
175 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-07 17:38 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-07 17:38 +0000
1from __future__ import annotations
3import base64
4import json
5import time
6from dataclasses import dataclass
7from pathlib import Path
8from typing import TYPE_CHECKING, Any, Awaitable, Callable, List, Literal, Optional, cast
9from urllib.parse import urlsplit
11from playwright.async_api import Cookie, Page
12from playwright.async_api import Response as PWResponse
13from playwright.async_api import TimeoutError as PlaywrightTimeoutError
14from selectolax.parser import HTMLParser
15from typing_extensions import override
17from .abstraction.http import URL, HttpMethod
18from .abstraction.request import FetchRequest
19from .abstraction.response import FetchResponse
20from .tools import auto_wrap_methods, make_screenshot
22if TYPE_CHECKING:
23 from .human_context import HumanContext
26@dataclass
27@auto_wrap_methods(decorator=make_screenshot)
28class HumanPage(Page):
29 """
30 A thin, type-compatible wrapper over Playwright's Page.
31 """
33 on_error_screenshot_path: str = ""
34 # ---------- core identity ----------
36 @property
37 @override
38 def context(self) -> "HumanContext":
39 # рантайм остаётся прежним; только уточняем тип
40 return cast("HumanContext", super().context)
42 @staticmethod
43 def replace(playwright_page: Page) -> HumanPage:
44 """Подменяет стандартный Playwright класс с сохранением содержимого."""
45 from .human_context import HumanContext # avoid circular import
47 if isinstance(playwright_page.context, HumanContext) is False:
48 raise TypeError("The provided Page's context is not a HumanContext")
50 playwright_page.__class__ = HumanPage
51 return playwright_page # type: ignore[return-value]
53 # ---------- lifecycle / sync ----------
55 @override
56 async def goto(
57 self,
58 url: str,
59 *,
60 retry: Optional[int] = None,
61 on_retry: Optional[Callable[[], Awaitable[None]]] = None,
62 # standard Playwright kwargs (not exhaustive; forwarded via **kwargs):
63 **kwargs: Any,
64 ) -> Optional[PWResponse]:
65 """
66 Navigate to `url` with optional retry-on-timeout.
68 If the initial navigation raises a Playwright `TimeoutError`, this method performs up to
69 `retry` *soft* reloads (`Page.reload`) using the same `wait_until`/`timeout` settings.
70 Before each retry, the optional `on_retry` hook is awaited so you can (re)attach
71 one-shot listeners, route handlers, subscriptions, etc., that would otherwise be spent.
73 Parameters
74 ----------
75 url : str
76 Absolute URL to navigate to.
77 retry : int | None, optional
78 Number of soft reload attempts after a timeout (0 means no retries).
79 If None, defaults to `session.page_retry`.
80 on_retry : Callable[[], Awaitable[None]] | None, optional
81 Async hook called before each retry; use it to re-register any one-shot
82 event handlers or routes needed for the next attempt.
83 timeout : float | None, optional
84 Navigation timeout in milliseconds. If None, falls back to `session.timeout * 1000`.
85 wait_until : {"commit", "domcontentloaded", "load", "networkidle"} | None, optional
86 When to consider the navigation successful (forwarded to Playwright).
87 referer : str | None, optional
88 Per-request `Referer` header
89 (overrides headers set via `page.set_extra_http_headers()`).
90 **kwargs : Any
91 Any additional keyword arguments are forwarded to Playwright's `Page.goto`.
93 Returns
94 -------
95 playwright.async_api.Response | None
96 The main resource `Response`, or `None` for `about:blank` and same-URL hash navigations.
98 Raises
99 ------
100 playwright.async_api.TimeoutError
101 If the initial navigation and all retries time out.
102 Any other exceptions from `Page.goto` / `Page.reload` may also propagate.
104 Notes
105 -----
106 - Soft reloads reuse the same `wait_until`/`timeout` pair to keep behavior consistent
107 across attempts.
108 - Because one-shot handlers are consumed after a failed attempt, always re-attach them
109 inside `on_retry` if the navigation logic depends on them.
110 """
111 # Build the kwargs for the underlying goto/reload calls:
112 try:
113 return await super().goto(url, **kwargs)
114 except PlaywrightTimeoutError as last_err:
115 attempts_left = (
116 int(retry) + 1 if retry is not None else 1
117 ) # +1 т.к. первый запрос базис
118 while attempts_left > 0:
119 attempts_left -= 1
120 if on_retry is not None:
121 await on_retry()
122 try:
123 # Soft refresh with the SAME wait_until/timeout
124 await super().reload(
125 **{k: kwargs[k] for k in ("wait_until", "timeout") if k in kwargs}
126 )
127 last_err = None
128 break
129 except PlaywrightTimeoutError as e:
130 last_err = e
131 if last_err is not None:
132 raise last_err
134 async def goto_render(self, first, /, **goto_kwargs) -> Optional[PWResponse]:
135 """
136 Перехватывает первый навигационный запрос main-frame к target_url и
137 отдаёт синтетический ответ, затем делает обычный page.goto(...).
138 Возвращает Optional[PWResponse] как и goto.
139 """
141 # -------- helpers (локально и коротко) ---------------------------------
142 def _to_bytes(data: bytes | bytearray | memoryview | str) -> bytes:
143 return (
144 data
145 if isinstance(data, bytes)
146 else (
147 bytes(data)
148 if isinstance(data, (bytearray, memoryview))
149 else data.encode("utf-8", "replace")
150 )
151 )
153 def _is_html(b: bytes) -> bool:
154 s = b[:512].lstrip().lower()
155 return s.startswith(b"<!doctype html") or s.startswith(b"<html") or b"<body" in s
157 def _norm_args() -> tuple[str, bytes, int, dict[str, str]]:
158 if isinstance(first, FetchResponse):
159 url = first.url.full_url
160 body = _to_bytes(first.raw or b"")
161 code = int(first.status_code)
162 hdrs = dict(first.headers or {})
163 else:
164 url = str(first)
165 if "body" not in goto_kwargs:
166 raise TypeError("goto_render(url=..., *, body=...) is required")
167 body = _to_bytes(goto_kwargs.pop("body"))
168 code = int(goto_kwargs.pop("status_code", 200))
169 hdrs = dict(goto_kwargs.pop("headers", {}) or {})
170 # убрать транспортные, поставить content-type при html
171 drop = {"content-length", "content-encoding", "transfer-encoding", "connection"}
172 clean = {k: v for k, v in hdrs.items() if k.lower() not in drop}
173 if body and not any(k.lower() == "content-type" for k in clean) and _is_html(body):
174 clean["content-type"] = "text/html; charset=utf-8"
175 return url, body, code, clean
177 # Переназначим ретраи до того, как их прочитает goto
178 retry = goto_kwargs.pop("retry", None)
179 on_retry = goto_kwargs.pop("on_retry", None)
181 target_url, raw, status_code, headers = _norm_args()
182 page = self
183 main_frame = page.main_frame
184 target_wo_hash = urlsplit(target_url)._replace(fragment="").geturl()
186 handled = False
187 installed = False
189 def _match(req) -> bool:
190 if (
191 req.frame is not main_frame
192 or not req.is_navigation_request()
193 or req.resource_type != "document"
194 ):
195 return False
196 return urlsplit(req.url)._replace(fragment="").geturl() == target_wo_hash
198 async def handler(route, request):
199 nonlocal handled, installed
200 if handled or not _match(request):
201 return await route.continue_()
202 handled = True
203 await route.fulfill(status=status_code, headers=headers, body=raw)
204 # Снимем маршрут сразу; если упадёт — не скрываем: пусть всплывёт позже.
205 await page.unroute(target_url, handler)
206 installed = False
208 async def _install():
209 nonlocal installed
210 if installed:
211 await page.unroute(target_url, handler)
212 await page.route(target_url, handler)
213 installed = True
215 await _install()
217 async def _on_retry_wrapper():
218 await _install()
219 if on_retry:
220 await on_retry()
222 # НИЧЕГО не прячем: если goto упадёт, а затем ещё и unroute упадёт
223 # — поднимем обе ошибки как группу
224 nav_exc: Exception | None = None
225 res: Optional[PWResponse] = None
226 try:
227 res = await page.goto(
228 target_url, retry=retry, on_retry=_on_retry_wrapper, **goto_kwargs
229 )
230 except Exception as e:
231 nav_exc = e
232 finally:
233 unroute_exc: Exception | None = None
234 if installed:
235 try:
236 await page.unroute(target_url, handler)
237 except Exception as e:
238 unroute_exc = e
239 if nav_exc and unroute_exc:
240 raise ExceptionGroup("goto_render failed", (nav_exc, unroute_exc))
241 if nav_exc:
242 raise nav_exc
243 if unroute_exc:
244 raise unroute_exc
246 return res
248 async def fetch(
249 self,
250 url: str,
251 *,
252 method: HttpMethod = HttpMethod.GET,
253 headers: Optional[dict[str, str]] = None,
254 body: Optional[str | list | dict] = None,
255 credentials: Literal["omit", "same-origin", "include"] = "include",
256 mode: Literal["cors", "no-cors", "same-origin"] = "cors",
257 redirect: Literal["follow", "error", "manual"] = "follow",
258 referrer: Optional[str] = None,
259 timeout_ms: int = 30000,
260 retry: int = 2,
261 ) -> FetchResponse:
262 """
263 Тонкая прослойка над JS fetch: выполняет запрос внутри страницы и возвращает ResponseModel.
264 • Без route / wait_for_event.
265 • raw — ВСЕГДА распакованные байты (если тело доступно JS).
266 • При opaque-ответе тело/заголовки могут быть недоступны — это ограничение CORS.
267 • `retry` повторяет запрос только при timeout (AbortController по timeout_ms).
268 """
269 if retry < 0:
270 raise ValueError("retry must be >= 0")
272 declared_headers = {k.lower(): v for k, v in (headers or {}).items()}
273 js_headers = {k: v for k, v in declared_headers.items() if k != "referer"}
274 js_ref = referrer or declared_headers.get("referer")
276 js_body: Any = body
277 if isinstance(body, (dict, list)):
278 js_body = json.dumps(body, ensure_ascii=False)
279 js_headers["content-type"] = "application/json"
281 start_t = time.perf_counter()
283 _JS_PATH = Path(__file__).parent / "fetch.js"
284 JS_FETCH = _JS_PATH.read_text(encoding="utf-8")
286 eval_payload = dict(
287 url=url,
288 method=method.value,
289 headers=js_headers or {},
290 body=js_body,
291 credentials=credentials,
292 mode=mode,
293 redirect=redirect,
294 ref=js_ref,
295 timeoutMs=timeout_ms,
296 )
298 attempts_left = retry
299 result: Any
300 while True:
301 result = await self.evaluate(JS_FETCH, eval_payload)
302 if result.get("ok"):
303 break
304 if result.get("isTimeout") and attempts_left > 0:
305 attempts_left -= 1
306 continue
307 raise RuntimeError(f"fetch failed: {result.get('error')}")
309 # bytes в raw: распакованные (если body доступен)
310 b64 = result.get("bodyB64")
311 raw = base64.b64decode(b64) if isinstance(b64, str) else b""
313 # Нормализуем заголовки: если raw есть,
314 # уберём transport-атрибуты, чтобы не путать потребителя
315 resp_headers = {k.lower(): v for k, v in (result.get("headers") or {}).items()}
316 if raw:
317 resp_headers.pop("content-encoding", None)
318 resp_headers.pop("content-length", None)
320 req_model = FetchRequest(
321 page=self,
322 method=method,
323 url=URL(full_url=url),
324 headers=declared_headers,
325 body=body,
326 )
328 duration = time.perf_counter() - start_t
329 end_epoch = time.time()
331 resp_model = FetchResponse(
332 page=self,
333 request=req_model,
334 url=URL(full_url=result.get("finalUrl") or url),
335 headers=resp_headers,
336 raw=raw, # всегда bytes; пусто если CORS не дал читать тело
337 status_code=int(result.get("status", 0)),
338 status_text=str(result.get("statusText", "STATUS TEXT NOT AVAILABLE")),
339 redirected=bool(result.get("redirected", False)),
340 type=result.get("type", False),
341 duration=duration,
342 end_time=end_epoch,
343 )
344 return resp_model
346 @property
347 def origin(self) -> str:
348 url_parts = urlsplit(self.url)
349 return f"{url_parts.scheme}://{url_parts.netloc}"
351 async def cookies(self) -> List[Cookie]:
352 """BrowserContext.cookies
354 Cookies for the current page URL. Alias for `page.context.cookies([page.url])`.
356 Returns
357 -------
358 List[{
359 name: str,
360 value: str,
361 domain: str,
362 path: str,
363 expires: float,
364 httpOnly: bool,
365 secure: bool,
366 sameSite: Union["Lax", "None", "Strict"],
367 partitionKey: Union[str, None]
368 }]
369 """
370 return await self.context.cookies([self.url])
372 async def local_storage(self, **kwargs) -> dict[str, str]:
373 ls = await self.context.local_storage(**kwargs)
374 return ls.get(self.origin, {})
376 async def session_storage(self) -> dict[str, str]:
377 return await self.evaluate(
378 """
379 (which) => {
380 try {
381 const s = (which in window) ? window[which] : null;
382 if (!s) return null;
383 return s;
384 } catch (_) {
385 return null;
386 }
387 }
388 """,
389 "sessionStorage",
390 )
392 async def json(self) -> list | dict:
393 """
394 Если контент страницы это json - парсит (браузер всегда оборачивает его в body->pre),
395 сереализует и выдает его.
396 """
397 body = await self.content()
398 tree = HTMLParser(body)
400 node = tree.css_first("body > pre") # точный селектор "body > pre"
401 if node is None:
402 raise RuntimeError("Содержимое страницы не является json-контейнером")
404 return json.loads(node.text())
406 def __repr__(self) -> str:
407 return f"<HumanPage wrapping {super().__repr__()!r}>"