Coverage for human_requests/human_page.py: 72%
176 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-05-28 00:39 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-05-28 00:39 +0000
1from __future__ import annotations
3import base64
4import json
5import time
6from dataclasses import dataclass
7from pathlib import Path
8from typing import TYPE_CHECKING, Any, Awaitable, Callable, List, Literal, Optional, cast
9from urllib.parse import urlsplit
11from playwright.async_api import Cookie, Page
12from playwright.async_api import Response as PWResponse
13from playwright.async_api import TimeoutError as PlaywrightTimeoutError
14from selectolax.parser import HTMLParser
15from typing_extensions import override
17from .abstraction.http import URL, HttpMethod
18from .abstraction.json_debug import loads_json_debug
19from .abstraction.request import FetchRequest
20from .abstraction.response import FetchResponse
21from .tools import auto_wrap_methods, make_screenshot
23if TYPE_CHECKING:
24 from .human_context import HumanContext
27@dataclass
28@auto_wrap_methods(decorator=make_screenshot)
29class HumanPage(Page):
30 """
31 A thin, type-compatible wrapper over Playwright's Page.
32 """
34 on_error_screenshot_path: str = ""
35 # ---------- core identity ----------
@property
@override
def context(self) -> "HumanContext":
    """Return the parent class's ``context``, statically typed as ``HumanContext``."""
    # The runtime object is untouched; the cast only narrows the type for checkers.
    owner = super().context
    return cast("HumanContext", owner)
@staticmethod
def replace(playwright_page: Page) -> HumanPage:
    """Re-class a Playwright ``Page`` as ``HumanPage`` in place, keeping its state.

    Parameters
    ----------
    playwright_page : Page
        The page whose ``__class__`` is swapped to ``HumanPage``.

    Returns
    -------
    HumanPage
        The very same object that was passed in, now an instance of ``HumanPage``.

    Raises
    ------
    TypeError
        If ``playwright_page.context`` is not a ``HumanContext``.
    """
    from .human_context import HumanContext  # local import avoids a circular import

    if not isinstance(playwright_page.context, HumanContext):
        raise TypeError("The provided Page's context is not a HumanContext")

    # Swap the class on the live object instead of building a new wrapper.
    playwright_page.__class__ = HumanPage
    return playwright_page  # type: ignore[return-value]
54 # ---------- lifecycle / sync ----------
@override
async def goto(
    self,
    url: str,
    *,
    retry: Optional[int] = None,
    on_retry: Optional[Callable[[], Awaitable[None]]] = None,
    # standard Playwright kwargs (not exhaustive; forwarded via **kwargs):
    **kwargs: Any,
) -> Optional[PWResponse]:
    """
    Navigate to `url` with optional retry-on-timeout.

    If the initial navigation raises a Playwright `TimeoutError`, this method performs up to
    `retry` *soft* reloads (`Page.reload`) using the same `wait_until`/`timeout` settings.
    Before each retry, the optional `on_retry` hook is awaited so you can (re)attach
    one-shot listeners, route handlers, subscriptions, etc., that would otherwise be spent.

    Parameters
    ----------
    url : str
        URL to navigate to.
    retry : int | None, optional
        Number of soft reload attempts after a timeout (0 means no retries).
        If None, a single reload is attempted.
    on_retry : Callable[[], Awaitable[None]] | None, optional
        Async hook awaited before each retry; use it to re-register any one-shot
        event handlers or routes needed for the next attempt.
    **kwargs : Any
        Forwarded unchanged to Playwright's `Page.goto` (e.g. `timeout`, `wait_until`,
        `referer`). Only `wait_until` and `timeout` are also forwarded to `Page.reload`.

    Returns
    -------
    playwright.async_api.Response | None
        Whatever `Page.goto` returned, or, if a retry was needed, whatever the first
        successful `Page.reload` returned.

    Raises
    ------
    playwright.async_api.TimeoutError
        If the initial navigation and all retries time out (the last timeout is raised).
        Any other exception from `Page.goto`, `Page.reload` or `on_retry` propagates
        immediately.
    """
    try:
        return await super().goto(url, **kwargs)
    except PlaywrightTimeoutError as first_err:
        last_err: PlaywrightTimeoutError = first_err

    # The initial attempt is already spent, so `retry` is exactly the number of reloads.
    reloads = int(retry) if retry is not None else 1
    # Soft refresh with the SAME wait_until/timeout as the original navigation.
    reload_kwargs = {k: kwargs[k] for k in ("wait_until", "timeout") if k in kwargs}

    for _ in range(reloads):
        if on_retry is not None:
            await on_retry()
        try:
            # Hand the reload's response back instead of dropping it.
            return await super().reload(**reload_kwargs)
        except PlaywrightTimeoutError as err:
            last_err = err

    raise last_err
async def goto_render(self, first, /, **goto_kwargs) -> Optional[PWResponse]:
    """
    Navigate with ``goto`` while answering the document request with a synthetic response.

    A route is installed for the target URL. The first main-frame navigation request of
    resource type ``document`` whose URL (fragment ignored) equals the target is fulfilled
    with the supplied status, headers and body; every other routed request is continued.
    The route is removed right after it has served, and re-installed before each retry.

    ``first`` is either a ``FetchResponse`` (its url, raw body, status code and headers are
    used) or anything convertible to a URL string, in which case ``body=`` is mandatory and
    ``status_code=`` (default 200) and ``headers=`` are optional keyword arguments.
    ``retry`` and ``on_retry`` are passed on to ``goto``; all remaining keyword arguments
    are forwarded to ``goto`` as well.

    Returns the same ``Optional[PWResponse]`` that ``goto`` returns.

    Raises ``TypeError`` when a URL is given without ``body``. Navigation and route-cleanup
    errors are both surfaced: if both occur they are raised together as an
    ``ExceptionGroup``.
    """
    # Take the retry options out first so they reach goto() as explicit arguments.
    retry = goto_kwargs.pop("retry", None)
    on_retry = goto_kwargs.pop("on_retry", None)

    def _as_bytes(data: bytes | bytearray | memoryview | str) -> bytes:
        # Normalise any accepted body type to immutable bytes.
        if isinstance(data, bytes):
            return data
        if isinstance(data, (bytearray, memoryview)):
            return bytes(data)
        return data.encode("utf-8", "replace")

    def _looks_like_html(payload: bytes) -> bool:
        # Cheap sniff on the first 512 bytes only.
        head = payload[:512].lstrip().lower()
        return head.startswith((b"<!doctype html", b"<html")) or b"<body" in head

    # ---- normalise the two accepted call shapes --------------------------------
    if isinstance(first, FetchResponse):
        target_url = first.url.full_url
        raw = _as_bytes(first.raw or b"")
        status_code = int(first.status_code)
        given_headers = dict(first.headers or {})
    else:
        target_url = str(first)
        if "body" not in goto_kwargs:
            raise TypeError("goto_render(url=..., *, body=...) is required")
        raw = _as_bytes(goto_kwargs.pop("body"))
        status_code = int(goto_kwargs.pop("status_code", 200))
        given_headers = dict(goto_kwargs.pop("headers", {}) or {})

    # Drop transport-level headers; add a content-type when the body looks like HTML.
    transport_only = {"content-length", "content-encoding", "transfer-encoding", "connection"}
    headers = {
        name: value for name, value in given_headers.items() if name.lower() not in transport_only
    }
    if raw and not any(name.lower() == "content-type" for name in headers) and _looks_like_html(raw):
        headers["content-type"] = "text/html; charset=utf-8"

    main_frame = self.main_frame
    target_sans_fragment = urlsplit(target_url)._replace(fragment="").geturl()

    served = False  # the synthetic response has been delivered once
    route_active = False  # the route is currently registered on the page

    def _is_target_navigation(request) -> bool:
        if request.frame is not main_frame:
            return False
        if not request.is_navigation_request():
            return False
        if request.resource_type != "document":
            return False
        return urlsplit(request.url)._replace(fragment="").geturl() == target_sans_fragment

    async def _serve(route, request):
        nonlocal served, route_active
        if served or not _is_target_navigation(request):
            return await route.continue_()
        served = True
        await route.fulfill(status=status_code, headers=headers, body=raw)
        # Remove the route immediately; a failure here is deliberately not swallowed.
        await self.unroute(target_url, _serve)
        route_active = False

    async def _arm_route():
        nonlocal route_active
        if route_active:
            await self.unroute(target_url, _serve)
        await self.route(target_url, _serve)
        route_active = True

    await _arm_route()

    async def _before_retry():
        # Re-install the route first, then run the caller's own hook.
        await _arm_route()
        if on_retry:
            await on_retry()

    # Nothing is hidden: if goto fails and the cleanup unroute fails too,
    # both errors are raised together as a group.
    nav_error: Exception | None = None
    response: Optional[PWResponse] = None
    try:
        response = await self.goto(
            target_url, retry=retry, on_retry=_before_retry, **goto_kwargs
        )
    except Exception as exc:
        nav_error = exc
    finally:
        cleanup_error: Exception | None = None
        if route_active:
            try:
                await self.unroute(target_url, _serve)
            except Exception as exc:
                cleanup_error = exc
        if nav_error and cleanup_error:
            raise ExceptionGroup("goto_render failed", (nav_error, cleanup_error))
        if nav_error:
            raise nav_error
        if cleanup_error:
            raise cleanup_error

    return response
async def fetch(
    self,
    url: str,
    *,
    method: HttpMethod = HttpMethod.GET,
    headers: Optional[dict[str, str]] = None,
    body: Optional[str | list | dict] = None,
    credentials: Literal["omit", "same-origin", "include"] = "include",
    mode: Literal["cors", "no-cors", "same-origin"] = "cors",
    redirect: Literal["follow", "error", "manual"] = "follow",
    referrer: Optional[str] = None,
    timeout_ms: int = 30000,
    retry: int = 2,
) -> FetchResponse:
    """
    Run a request inside the page via the ``fetch.js`` script and wrap the outcome.

    - Header names are lower-cased. A ``referer`` header is not sent as a header; it is
      used as the referrer when ``referrer`` is not given.
    - A ``dict``/``list`` body is JSON-encoded and ``content-type`` is set to
      ``application/json``.
    - ``raw`` on the result is the base64-decoded ``bodyB64`` reported by the script, or
      ``b""`` when none was reported; with a non-empty ``raw`` the ``content-encoding``
      and ``content-length`` response headers are removed.
    - The script is re-run only when it reports ``isTimeout`` and retries remain
      (at most ``retry`` extra runs).

    Raises ``ValueError`` if ``retry`` is negative and ``RuntimeError`` when the script
    reports a failure that is not retried.
    """
    if retry < 0:
        raise ValueError("retry must be >= 0")

    declared_headers = {name.lower(): value for name, value in (headers or {}).items()}
    # Everything except the referer travels as a real request header.
    js_headers = dict(declared_headers)
    js_headers.pop("referer", None)
    js_ref = referrer or declared_headers.get("referer")

    js_body: Any
    if isinstance(body, (dict, list)):
        js_body = json.dumps(body, ensure_ascii=False)
        js_headers["content-type"] = "application/json"
    else:
        js_body = body

    started = time.perf_counter()

    script = (Path(__file__).parent / "fetch.js").read_text(encoding="utf-8")

    payload = {
        "url": url,
        "method": method.value,
        "headers": js_headers or {},
        "body": js_body,
        "credentials": credentials,
        "mode": mode,
        "redirect": redirect,
        "ref": js_ref,
        "timeoutMs": timeout_ms,
    }

    remaining = retry
    result: Any
    while True:
        result = await self.evaluate(script, payload)
        if result.get("ok"):
            break
        # Only a reported timeout is retried, and only while attempts remain.
        if not (result.get("isTimeout") and remaining > 0):
            raise RuntimeError(f"fetch failed: {result.get('error')}")
        remaining -= 1

    encoded_body = result.get("bodyB64")
    raw = base64.b64decode(encoded_body) if isinstance(encoded_body, str) else b""

    # With a body in hand, drop the transport attributes so consumers are not misled.
    resp_headers = {name.lower(): value for name, value in (result.get("headers") or {}).items()}
    if raw:
        for transport_key in ("content-encoding", "content-length"):
            resp_headers.pop(transport_key, None)

    request_model = FetchRequest(
        page=self,
        method=method,
        url=URL(full_url=url),
        headers=declared_headers,
        body=body,
    )

    duration = time.perf_counter() - started
    end_epoch = time.time()

    return FetchResponse(
        page=self,
        request=request_model,
        url=URL(full_url=result.get("finalUrl") or url),
        headers=resp_headers,
        raw=raw,  # always bytes; empty when no body was reported
        status_code=int(result.get("status", 0)),
        status_text=str(result.get("statusText", "STATUS TEXT NOT AVAILABLE")),
        redirected=bool(result.get("redirected", False)),
        type=result.get("type", False),
        duration=duration,
        end_time=end_epoch,
    )
@property
def origin(self) -> str:
    """Return ``scheme://netloc`` of the page's current ``url``."""
    parts = urlsplit(self.url)
    return "://".join((parts.scheme, parts.netloc))
async def cookies(self) -> List[Cookie]:
    """Return the context's cookies for the current page URL.

    Shorthand for ``page.context.cookies([page.url])``.

    Returns
    -------
    List[Cookie]
        Whatever ``context.cookies`` returns for this URL. In Playwright each entry is
        a mapping with ``name``, ``value``, ``domain``, ``path``, ``expires``,
        ``httpOnly``, ``secure``, ``sameSite`` and ``partitionKey``.
    """
    current_url = self.url
    return await self.context.cookies([current_url])
async def local_storage(self, **kwargs) -> dict[str, str]:
    """Return the localStorage entries recorded for this page's origin.

    ``kwargs`` are forwarded to ``context.local_storage``; an origin that is absent
    from its result yields an empty dict.
    """
    by_origin = await self.context.local_storage(**kwargs)
    return by_origin.get(self.origin, {})
async def session_storage(self) -> dict[str, str]:
    """Evaluate a script in the page that returns ``window.sessionStorage``.

    The script returns ``null`` when the property is missing, falsy, or accessing it
    throws; the value handed back by ``evaluate`` is returned as is.
    """
    script = """
        (which) => {
            try {
                const s = (which in window) ? window[which] : null;
                if (!s) return null;
                return s;
            } catch (_) {
                return null;
            }
        }
        """
    return await self.evaluate(script, "sessionStorage")
async def json(self) -> list | dict:
    """
    Parse the page as a JSON document.

    Reads the page content, takes the text of the first ``body > pre`` element
    (the original author notes that browsers wrap raw JSON this way) and passes it to
    ``loads_json_debug``.

    Raises ``RuntimeError`` when the page has no ``body > pre`` element.
    """
    html = await self.content()
    pre = HTMLParser(html).css_first("body > pre")  # exact selector "body > pre"
    if pre is not None:
        return loads_json_debug(pre.text())
    raise RuntimeError("Содержимое страницы не является json-контейнером")
def __repr__(self) -> str:
    """Return the parent class's repr, quoted and tagged as a ``HumanPage`` wrapper."""
    inner = super().__repr__()
    return f"<HumanPage wrapping {inner!r}>"