Coverage for human_requests / human_page.py: 68%
161 statements
from __future__ import annotations

import base64
import json
import time
from pathlib import Path
from typing import TYPE_CHECKING, Any, Awaitable, Callable, List, Literal, Optional, cast
from urllib.parse import urlsplit

from playwright.async_api import Cookie, Page
from playwright.async_api import Response as PWResponse
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from selectolax.parser import HTMLParser
from typing_extensions import override

from .abstraction.http import URL, HttpMethod
from .abstraction.request import FetchRequest
from .abstraction.response import FetchResponse

if TYPE_CHECKING:
    from .human_context import HumanContext


class HumanPage(Page):
    """
    A thin, type-compatible wrapper over Playwright's Page.
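
    Example (illustrative sketch; assumes `context` is a `HumanContext` whose
    `new_page()` returns a plain Playwright `Page`)::

        page = HumanPage.replace(await context.new_page())
        await page.goto("https://example.com", retry=1)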
    """

    # ---------- core identity ----------

    @property
    @override
    def context(self) -> "HumanContext":
        # The runtime object is unchanged; we only narrow the declared type.
        return cast("HumanContext", super().context)

    @staticmethod
    def replace(playwright_page: Page) -> HumanPage:
        """Swap the class of a standard Playwright `Page` to `HumanPage`, preserving its state."""
        from .human_context import HumanContext  # avoid circular import

        if not isinstance(playwright_page.context, HumanContext):
            raise TypeError("The provided Page's context is not a HumanContext")

        playwright_page.__class__ = HumanPage
        return playwright_page  # type: ignore[return-value]

    # ---------- lifecycle / sync ----------

    @override
    async def goto(
        self,
        url: str,
        *,
        retry: Optional[int] = None,
        on_retry: Optional[Callable[[], Awaitable[None]]] = None,
        # standard Playwright kwargs (not exhaustive; forwarded via **kwargs):
        **kwargs: Any,
    ) -> Optional[PWResponse]:
        """
        Navigate to `url` with optional retry-on-timeout.

        If the initial navigation raises a Playwright `TimeoutError`, this method performs up to
        `retry` *soft* reloads (`Page.reload`) using the same `wait_until`/`timeout` settings.
        Before each retry, the optional `on_retry` hook is awaited so you can (re)attach
        one-shot listeners, route handlers, subscriptions, etc., that would otherwise be spent.

        Parameters
        ----------
        url : str
            Absolute URL to navigate to.
        retry : int | None, optional
            Number of soft reload attempts after a timeout (0 means no retries).
            If None, defaults to `session.page_retry`.
        on_retry : Callable[[], Awaitable[None]] | None, optional
            Async hook called before each retry; use it to re-register any one-shot
            event handlers or routes needed for the next attempt.
        timeout : float | None, optional
            Navigation timeout in milliseconds. If None, falls back to `session.timeout * 1000`.
        wait_until : {"commit", "domcontentloaded", "load", "networkidle"} | None, optional
            When to consider the navigation successful (forwarded to Playwright).
        referer : str | None, optional
            Per-request `Referer` header
            (overrides headers set via `page.set_extra_http_headers()`).
        **kwargs : Any
            Any additional keyword arguments are forwarded to Playwright's `Page.goto`.

        Returns
        -------
        playwright.async_api.Response | None
            The main resource `Response`, or `None` for `about:blank` and same-URL hash navigations.

        Raises
        ------
        playwright.async_api.TimeoutError
            If the initial navigation and all retries time out.
            Any other exceptions from `Page.goto` / `Page.reload` may also propagate.

        Notes
        -----
        - Soft reloads reuse the same `wait_until`/`timeout` pair to keep behavior consistent
          across attempts.
        - Because one-shot handlers are consumed after a failed attempt, always re-attach them
          inside `on_retry` if the navigation logic depends on them.
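
        Examples
        --------
        Illustrative sketch only; `arm_listeners` is a hypothetical user coroutine that
        re-attaches one-shot handlers before each retry::

            async def arm_listeners() -> None:
                page.once("response", lambda r: print(r.url))

            await page.goto(
                "https://example.com",
                retry=2,
                on_retry=arm_listeners,
                wait_until="domcontentloaded",
            )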
        """
        # First attempt: forward kwargs directly to Playwright's goto.
        try:
            return await super().goto(url, **kwargs)
        except PlaywrightTimeoutError as last_err:
            attempts_left = (
                int(retry) + 1 if retry is not None else 1
            )  # +1 because the initial request is the baseline attempt
            while attempts_left > 0:
                attempts_left -= 1
                if on_retry is not None:
                    await on_retry()
                try:
                    # Soft refresh with the SAME wait_until/timeout
                    await super().reload(
                        **{k: kwargs[k] for k in ("wait_until", "timeout") if k in kwargs}
                    )
                    last_err = None
                    break
                except PlaywrightTimeoutError as e:
                    last_err = e
            if last_err is not None:
                raise last_err

    async def goto_render(self, first, /, **goto_kwargs) -> Optional[PWResponse]:
        """
        Intercept the first main-frame navigation request to the target URL and serve a
        synthetic response for it, then perform a regular `page.goto(...)`.
        Returns `Optional[PWResponse]`, just like `goto`.
        """

        # -------- helpers (local and short) -------------------------------------
        def _to_bytes(data: bytes | bytearray | memoryview | str) -> bytes:
            return (
                data
                if isinstance(data, bytes)
                else (
                    bytes(data)
                    if isinstance(data, (bytearray, memoryview))
                    else data.encode("utf-8", "replace")
                )
            )

        def _is_html(b: bytes) -> bool:
            s = b[:512].lstrip().lower()
            return s.startswith(b"<!doctype html") or s.startswith(b"<html") or b"<body" in s

        def _norm_args() -> tuple[str, bytes, int, dict[str, str]]:
            if isinstance(first, FetchResponse):
                url = first.url.full_url
                body = _to_bytes(first.raw or b"")
                code = int(first.status_code)
                hdrs = dict(first.headers or {})
            else:
                url = str(first)
                if "body" not in goto_kwargs:
                    raise TypeError("goto_render(url=..., *, body=...) is required")
                body = _to_bytes(goto_kwargs.pop("body"))
                code = int(goto_kwargs.pop("status_code", 200))
                hdrs = dict(goto_kwargs.pop("headers", {}) or {})
            # Drop transport headers; set a content-type when the body looks like HTML.
            drop = {"content-length", "content-encoding", "transfer-encoding", "connection"}
            clean = {k: v for k, v in hdrs.items() if k.lower() not in drop}
            if body and not any(k.lower() == "content-type" for k in clean) and _is_html(body):
                clean["content-type"] = "text/html; charset=utf-8"
            return url, body, code, clean

        # Pull the retry arguments out of goto_kwargs before goto consumes them.
        retry = goto_kwargs.pop("retry", None)
        on_retry = goto_kwargs.pop("on_retry", None)

        target_url, raw, status_code, headers = _norm_args()
        page = self
        main_frame = page.main_frame
        target_wo_hash = urlsplit(target_url)._replace(fragment="").geturl()

        handled = False
        installed = False

        def _match(req) -> bool:
            if (
                req.frame is not main_frame
                or not req.is_navigation_request()
                or req.resource_type != "document"
            ):
                return False
            return urlsplit(req.url)._replace(fragment="").geturl() == target_wo_hash

        async def handler(route, request):
            nonlocal handled, installed
            if handled or not _match(request):
                return await route.continue_()
            handled = True
            await route.fulfill(status=status_code, headers=headers, body=raw)
            # Remove the route right away; if that fails, don't hide it: let it surface later.
            await page.unroute(target_url, handler)
            installed = False

        async def _install():
            nonlocal installed
            if installed:
                await page.unroute(target_url, handler)
            await page.route(target_url, handler)
            installed = True

        await _install()

        async def _on_retry_wrapper():
            await _install()
            if on_retry:
                await on_retry()

        # Hide NOTHING: if goto fails and the subsequent unroute fails as well,
        # raise both errors as a group.
        nav_exc: Exception | None = None
        res: Optional[PWResponse] = None
        try:
            res = await page.goto(
                target_url, retry=retry, on_retry=_on_retry_wrapper, **goto_kwargs
            )
        except Exception as e:
            nav_exc = e
        finally:
            unroute_exc: Exception | None = None
            if installed:
                try:
                    await page.unroute(target_url, handler)
                except Exception as e:
                    unroute_exc = e
            if nav_exc and unroute_exc:
                raise ExceptionGroup("goto_render failed", (nav_exc, unroute_exc))
            if nav_exc:
                raise nav_exc
            if unroute_exc:
                raise unroute_exc

        return res
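
    # Usage sketch for `goto_render` (illustrative only; the URLs and the HTML body
    # below are placeholders, not part of this module):
    #
    #     resp = await page.fetch("https://example.com/api/page")
    #     await page.goto_render(resp, wait_until="domcontentloaded")
    #
    #     await page.goto_render(
    #         "https://example.com/fake",
    #         body="<html><body>Hello</body></html>",
    #         status_code=200,
    #     )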

    async def fetch(
        self,
        url: str,
        *,
        method: HttpMethod = HttpMethod.GET,
        headers: Optional[dict[str, str]] = None,
        body: Optional[str | list | dict] = None,
        credentials: Literal["omit", "same-origin", "include"] = "include",
        mode: Literal["cors", "no-cors", "same-origin"] = "cors",
        redirect: Literal["follow", "error", "manual"] = "follow",
        referrer: Optional[str] = None,
        timeout_ms: int = 30000,
    ) -> FetchResponse:
        """
        Thin layer over the in-page JS `fetch`: performs the request inside the page and
        returns a `FetchResponse`.

        • No `route` / `wait_for_event` machinery is involved.
        • `raw` is ALWAYS the decompressed bytes (when the body is readable from JS).
        • For an opaque response the body/headers may be unavailable; this is a CORS limitation.
        """
        declared_headers = {k.lower(): v for k, v in (headers or {}).items()}
        js_headers = {k: v for k, v in declared_headers.items() if k != "referer"}
        js_ref = referrer or declared_headers.get("referer")

        js_body: Any = body
        if isinstance(body, (dict, list)):
            js_body = json.dumps(body, ensure_ascii=False)
            js_headers["content-type"] = "application/json"

        start_t = time.perf_counter()

        _JS_PATH = Path(__file__).parent / "fetch.js"
        JS_FETCH = _JS_PATH.read_text(encoding="utf-8")

        result = await self.evaluate(
            JS_FETCH,
            dict(
                url=url,
                method=method.value,
                headers=js_headers or {},
                body=js_body,
                credentials=credentials,
                mode=mode,
                redirect=redirect,
                ref=js_ref,
                timeoutMs=timeout_ms,
            ),
        )

        if not result.get("ok"):
            raise RuntimeError(f"fetch failed: {result.get('error')}")

        # raw bytes: already decompressed (when the body is available)
        b64 = result.get("bodyB64")
        raw = base64.b64decode(b64) if isinstance(b64, str) else b""

        # Normalize headers: when raw is present, drop the transport attributes
        # so they don't mislead consumers.
        resp_headers = {k.lower(): v for k, v in (result.get("headers") or {}).items()}
        if raw:
            resp_headers.pop("content-encoding", None)
            resp_headers.pop("content-length", None)

        req_model = FetchRequest(
            page=self,
            method=method,
            url=URL(full_url=url),
            headers=declared_headers,
            body=body,
        )

        duration = time.perf_counter() - start_t
        end_epoch = time.time()

        resp_model = FetchResponse(
            page=self,
            request=req_model,
            url=URL(full_url=result.get("finalUrl") or url),
            headers=resp_headers,
            raw=raw,  # always bytes; empty if CORS did not allow reading the body
            status_code=int(result.get("status", 0)),
            status_text=str(result.get("statusText", "STATUS TEXT NOT AVAILABLE")),
            redirected=bool(result.get("redirected", False)),
            type=result.get("type", False),
            duration=duration,
            end_time=end_epoch,
        )
        return resp_model
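
    # Usage sketch for `fetch` (illustrative only; the URL and payload are placeholders):
    #
    #     resp = await page.fetch(
    #         "https://example.com/api/items",
    #         method=HttpMethod.POST,
    #         body={"query": "test"},  # dict/list bodies are JSON-encoded automatically
    #         headers={"x-requested-with": "XMLHttpRequest"},
    #     )
    #     data = json.loads(resp.raw.decode("utf-8")) if resp.raw else None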

    @property
    def origin(self) -> str:
        url_parts = urlsplit(self.url)
        return f"{url_parts.scheme}://{url_parts.netloc}"

    async def cookies(self) -> List[Cookie]:
        """BrowserContext.cookies

        Cookies for the current page URL. Alias for `page.context.cookies([page.url])`.

        Returns
        -------
        List[{
            name: str,
            value: str,
            domain: str,
            path: str,
            expires: float,
            httpOnly: bool,
            secure: bool,
            sameSite: Union["Lax", "None", "Strict"],
            partitionKey: Union[str, None]
        }]
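
        Example (illustrative)::

            jar = {c["name"]: c["value"] for c in await page.cookies()}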
        """
        return await self.context.cookies([self.url])

    async def local_storage(self, **kwargs) -> dict[str, str]:
        """localStorage of the current page's origin (delegates to `context.local_storage`)."""
        ls = await self.context.local_storage(**kwargs)
        return ls.get(self.origin, {})

    async def session_storage(self) -> dict[str, str]:
        # Read window.sessionStorage from inside the page (null-safe in JS).
        return await self.evaluate(
            """
            (which) => {
                try {
                    const s = (which in window) ? window[which] : null;
                    if (!s) return null;
                    return s;
                } catch (_) {
                    return null;
                }
            }
            """,
            "sessionStorage",
        )

    async def json(self) -> list | dict:
        """
        If the page content is JSON, parse it (the browser always wraps such content in
        `body > pre`) and return the deserialized object.
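
        Example (illustrative; the URL is a placeholder)::

            await page.goto("https://example.com/api/data.json")
            data = await page.json()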
        """
        body = await self.content()
        tree = HTMLParser(body)

        node = tree.css_first("body > pre")  # exact "body > pre" selector
        if node is None:
            raise RuntimeError("The page content is not a JSON container")

        return json.loads(node.text())

    def __repr__(self) -> str:
        return f"<HumanPage wrapping {super().__repr__()}>"