Coverage for human_requests/human_page.py

1from __future__ import annotations

3import base64

4import json

5import time

6from pathlib import Path

7from typing import TYPE_CHECKING, Any, Awaitable, Callable, List, Literal, Optional, cast

8from urllib.parse import urlsplit

10from playwright.async_api import Cookie, Page

11from playwright.async_api import Response as PWResponse

12from playwright.async_api import TimeoutError as PlaywrightTimeoutError

13from selectolax.parser import HTMLParser

14from typing_extensions import override

16from .abstraction.http import URL, HttpMethod

17from .abstraction.request import FetchRequest

18from .abstraction.response import FetchResponse

20if TYPE_CHECKING:

21 from .human_context import HumanContext

class HumanPage(Page):
    """
    A thin, type-compatible wrapper over Playwright's Page.

    Instances are not constructed directly: an existing Playwright ``Page`` is
    re-classed in place via :meth:`replace`. On top of the regular ``Page`` API
    this class adds retry-aware navigation (:meth:`goto`), navigation served
    from a synthetic response (:meth:`goto_render`), an in-page ``fetch``
    helper, and a few storage / content conveniences.
    """

    # Source of the in-page fetch helper (``fetch.js`` next to this module).
    # Read lazily on the first ``fetch()`` call and then reused, so the file is
    # not re-read from disk on every request.
    _fetch_js_source: Optional[str] = None

    # ---------- core identity ----------

    @property
    @override
    def context(self) -> "HumanContext":
        """Return the owning browser context, typed as ``HumanContext``."""
        # Runtime behaviour is unchanged; only the static type is narrowed.
        return cast("HumanContext", super().context)

    @staticmethod
    def replace(playwright_page: Page) -> HumanPage:
        """
        Re-class a standard Playwright ``Page`` as ``HumanPage``, keeping its state.

        The object is mutated in place (its ``__class__`` is swapped) and the very
        same object is returned.

        Raises
        ------
        TypeError
            If the page's context is not a ``HumanContext``.
        """
        from .human_context import HumanContext  # avoid circular import

        if not isinstance(playwright_page.context, HumanContext):
            raise TypeError("The provided Page's context is not a HumanContext")

        playwright_page.__class__ = HumanPage
        return playwright_page  # type: ignore[return-value]

    # ---------- lifecycle / sync ----------

    @override
    async def goto(
        self,
        url: str,
        *,
        retry: Optional[int] = None,
        on_retry: Optional[Callable[[], Awaitable[None]]] = None,
        # standard Playwright kwargs (not exhaustive; forwarded via **kwargs):
        **kwargs: Any,
    ) -> Optional[PWResponse]:
        """
        Navigate to `url` with optional retry-on-timeout.

        If the initial navigation raises a Playwright `TimeoutError`, this method performs up to
        `retry` *soft* reloads (`Page.reload`) using the same `wait_until`/`timeout` settings.
        Before each retry, the optional `on_retry` hook is awaited so you can (re)attach
        one-shot listeners, route handlers, subscriptions, etc., that would otherwise be spent.

        Parameters
        ----------
        url : str
            Absolute URL to navigate to.
        retry : int | None, optional
            Number of soft reload attempts after a timeout (0 means no retries;
            negative values are treated as 0). If None, a single reload is attempted.
        on_retry : Callable[[], Awaitable[None]] | None, optional
            Async hook called before each retry; use it to re-register any one-shot
            event handlers or routes needed for the next attempt.
        timeout : float | None, optional
            Navigation timeout in milliseconds (forwarded to Playwright; no default
            is applied by this method).
        wait_until : {"commit", "domcontentloaded", "load", "networkidle"} | None, optional
            When to consider the navigation successful (forwarded to Playwright).
        referer : str | None, optional
            Per-request `Referer` header
            (overrides headers set via `page.set_extra_http_headers()`).
            Forwarded to the initial `Page.goto` only, not to reloads.
        **kwargs : Any
            Any additional keyword arguments are forwarded to Playwright's `Page.goto`.

        Returns
        -------
        playwright.async_api.Response | None
            The main resource `Response` of the attempt that succeeded (the initial
            navigation or a reload), or `None` when Playwright itself returns `None`
            (e.g. `about:blank` and same-URL hash navigations).

        Raises
        ------
        playwright.async_api.TimeoutError
            If the initial navigation and all retries time out (the last timeout is raised).
        Any other exceptions from `Page.goto` / `Page.reload` / `on_retry` propagate immediately.

        Notes
        -----
        - Soft reloads reuse the same `wait_until`/`timeout` pair to keep behavior consistent
          across attempts.
        - Because one-shot handlers are consumed after a failed attempt, always re-attach them
          inside `on_retry` if the navigation logic depends on them.
        """
        try:
            return await super().goto(url, **kwargs)
        except PlaywrightTimeoutError as first_err:
            # The initial navigation above is the "base" attempt and is NOT counted
            # here: `retry` is strictly the number of additional reloads.
            reloads_left = 1 if retry is None else max(int(retry), 0)
            # Only these two options are meaningful for Page.reload.
            reload_kwargs = {k: kwargs[k] for k in ("wait_until", "timeout") if k in kwargs}

            last_err: PlaywrightTimeoutError = first_err
            while reloads_left > 0:
                reloads_left -= 1
                if on_retry is not None:
                    await on_retry()
                try:
                    # Hand the reload's response back to the caller instead of dropping it.
                    return await super().reload(**reload_kwargs)
                except PlaywrightTimeoutError as err:
                    last_err = err
            raise last_err

    async def goto_render(
        self, first: FetchResponse | str, /, **goto_kwargs: Any
    ) -> Optional[PWResponse]:
        """
        Navigate to a URL while serving its main document from a synthetic response.

        The main-frame navigation request to the target URL is intercepted and
        fulfilled with the supplied body/status/headers; then a regular
        ``page.goto(...)`` is performed. Returns ``Optional[PWResponse]`` exactly
        like :meth:`goto`.

        Parameters
        ----------
        first : FetchResponse | str
            Either a ``FetchResponse`` (its URL, raw body, status code and headers are
            used) or a URL. Any non-``FetchResponse`` value is converted with ``str()``.
        body : bytes | bytearray | memoryview | str
            Required keyword when `first` is a URL: the document body to serve.
        status_code : int, optional
            Status of the synthetic response when `first` is a URL (default 200).
        headers : dict[str, str] | None, optional
            Headers of the synthetic response when `first` is a URL.
        retry, on_retry
            Same meaning as in :meth:`goto`. The interception is re-armed before
            every retry, then the caller's `on_retry` hook (if any) is awaited.
        **goto_kwargs : Any
            Everything else is forwarded to :meth:`goto`.

        Raises
        ------
        TypeError
            If `first` is a URL and ``body`` was not supplied.
        ExceptionGroup
            If both the navigation and the final route cleanup fail.
        Exception
            Otherwise, whichever of the two (navigation or cleanup) failed.
        """

        # -------- small local helpers ------------------------------------------
        def _to_bytes(data: bytes | bytearray | memoryview | str) -> bytes:
            if isinstance(data, bytes):
                return data
            if isinstance(data, (bytearray, memoryview)):
                return bytes(data)
            return data.encode("utf-8", "replace")

        def _is_html(b: bytes) -> bool:
            # Cheap sniffing on the first 512 bytes only.
            s = b[:512].lstrip().lower()
            return s.startswith((b"<!doctype html", b"<html")) or b"<body" in s

        def _strip_fragment(u: str) -> str:
            return urlsplit(u)._replace(fragment="").geturl()

        def _norm_args() -> tuple[str, bytes, int, dict[str, str]]:
            if isinstance(first, FetchResponse):
                url = first.url.full_url
                body = _to_bytes(first.raw or b"")
                code = int(first.status_code)
                hdrs = dict(first.headers or {})
            else:
                url = str(first)
                if "body" not in goto_kwargs:
                    raise TypeError("goto_render(url=..., *, body=...) is required")
                body = _to_bytes(goto_kwargs.pop("body"))
                code = int(goto_kwargs.pop("status_code", 200))
                hdrs = dict(goto_kwargs.pop("headers", {}) or {})
            # Drop transport-level headers (the body is served as-is) and add a
            # content-type when the body looks like HTML and none was given.
            drop = {"content-length", "content-encoding", "transfer-encoding", "connection"}
            clean = {k: v for k, v in hdrs.items() if k.lower() not in drop}
            if body and not any(k.lower() == "content-type" for k in clean) and _is_html(body):
                clean["content-type"] = "text/html; charset=utf-8"
            return url, body, code, clean

        # Take the retry options out before goto() gets to read them: the retry
        # hook has to be wrapped so the interception is re-armed first.
        retry = goto_kwargs.pop("retry", None)
        on_retry = goto_kwargs.pop("on_retry", None)

        target_url, raw, status_code, headers = _norm_args()
        page = self
        main_frame = page.main_frame
        target_wo_hash = _strip_fragment(target_url)

        handled = False
        installed = False

        def _route_matcher(candidate_url: str) -> bool:
            # Registered with page.route() instead of the raw URL string: a string
            # is interpreted as a glob and would also keep the "#fragment", which
            # never appears in request URLs, so such a route would never fire.
            return _strip_fragment(candidate_url) == target_wo_hash

        def _match(req) -> bool:
            if (
                req.frame is not main_frame
                or not req.is_navigation_request()
                or req.resource_type != "document"
            ):
                return False
            return _strip_fragment(req.url) == target_wo_hash

        async def handler(route, request):
            nonlocal handled, installed
            if handled or not _match(request):
                return await route.continue_()
            handled = True
            await route.fulfill(status=status_code, headers=headers, body=raw)
            # Remove the route right away; an unroute failure is not hidden here.
            await page.unroute(_route_matcher, handler)
            installed = False

        async def _install():
            nonlocal handled, installed
            if installed:
                await page.unroute(_route_matcher, handler)
            await page.route(_route_matcher, handler)
            installed = True
            # Re-arm the one-shot so a retry is served the synthetic response again.
            handled = False

        await _install()

        async def _on_retry_wrapper():
            await _install()
            if on_retry:
                await on_retry()

        # Nothing is swallowed: if goto fails and the cleanup unroute fails too,
        # both errors are raised together as a group.
        nav_exc: Optional[Exception] = None
        unroute_exc: Optional[Exception] = None
        res: Optional[PWResponse] = None
        try:
            res = await page.goto(
                target_url, retry=retry, on_retry=_on_retry_wrapper, **goto_kwargs
            )
        except Exception as e:
            nav_exc = e
        finally:
            # Runs on cancellation as well; the raises below stay outside
            # `finally` so a cleanup error cannot replace a propagating
            # BaseException such as CancelledError.
            if installed:
                try:
                    await page.unroute(_route_matcher, handler)
                except Exception as e:
                    unroute_exc = e
                else:
                    installed = False

        if nav_exc is not None and unroute_exc is not None:
            raise ExceptionGroup("goto_render failed", (nav_exc, unroute_exc))
        if nav_exc is not None:
            raise nav_exc
        if unroute_exc is not None:
            raise unroute_exc

        return res

    async def fetch(
        self,
        url: str,
        *,
        method: HttpMethod = HttpMethod.GET,
        headers: Optional[dict[str, str]] = None,
        body: Optional[str | list | dict] = None,
        credentials: Literal["omit", "same-origin", "include"] = "include",
        mode: Literal["cors", "no-cors", "same-origin"] = "cors",
        redirect: Literal["follow", "error", "manual"] = "follow",
        referrer: Optional[str] = None,
        timeout_ms: int = 30000,
    ) -> FetchResponse:
        """
        Thin layer over the JS ``fetch``: runs the request inside the page.

        - No routes and no ``wait_for_event`` are involved.
        - ``raw`` is ALWAYS decoded bytes (when the body is readable from JS).
        - For opaque responses the body/headers may be unavailable — that is a
          CORS limitation.

        Parameters
        ----------
        url : str
            Request URL.
        method : HttpMethod
            HTTP method; its ``.value`` is passed to the in-page script.
        headers : dict[str, str] | None
            Request headers. Names are lower-cased; a ``referer`` entry is not sent
            as a header but used as the referrer when `referrer` is not given.
        body : str | list | dict | None
            Request body. ``dict``/``list`` values are JSON-encoded and the
            ``content-type`` header is set to ``application/json``.
        credentials, mode, redirect, referrer, timeout_ms
            Passed through to the in-page script (``fetch.js``).

        Returns
        -------
        FetchResponse
            Response model with the request model attached.

        Raises
        ------
        RuntimeError
            If the in-page script reports a failure (``ok`` is falsy).
        """
        declared_headers = {k.lower(): v for k, v in (headers or {}).items()}
        js_headers = {k: v for k, v in declared_headers.items() if k != "referer"}
        js_ref = referrer or declared_headers.get("referer")

        js_body: Any = body
        if isinstance(body, (dict, list)):
            js_body = json.dumps(body, ensure_ascii=False)
            js_headers["content-type"] = "application/json"

        start_t = time.perf_counter()

        # Load the helper script once and reuse it for subsequent calls.
        js_fetch = HumanPage._fetch_js_source
        if js_fetch is None:
            js_fetch = (Path(__file__).parent / "fetch.js").read_text(encoding="utf-8")
            HumanPage._fetch_js_source = js_fetch

        result = await self.evaluate(
            js_fetch,
            dict(
                url=url,
                method=method.value,
                headers=js_headers or {},
                body=js_body,
                credentials=credentials,
                mode=mode,
                redirect=redirect,
                ref=js_ref,
                timeoutMs=timeout_ms,
            ),
        )

        if not result.get("ok"):
            raise RuntimeError(f"fetch failed: {result.get('error')}")

        # The script hands the body back base64-encoded; decode it to bytes.
        b64 = result.get("bodyB64")
        raw = base64.b64decode(b64) if isinstance(b64, str) else b""

        # Normalise headers: once we hold the decoded body, the transport
        # attributes would only mislead the consumer, so drop them.
        resp_headers = {k.lower(): v for k, v in (result.get("headers") or {}).items()}
        if raw:
            resp_headers.pop("content-encoding", None)
            resp_headers.pop("content-length", None)

        req_model = FetchRequest(
            page=self,
            method=method,
            url=URL(full_url=url),
            headers=declared_headers,
            body=body,
        )

        duration = time.perf_counter() - start_t
        end_epoch = time.time()

        resp_model = FetchResponse(
            page=self,
            request=req_model,
            url=URL(full_url=result.get("finalUrl") or url),
            headers=resp_headers,
            raw=raw,  # always bytes; empty when CORS did not allow reading the body
            status_code=int(result.get("status", 0)),
            status_text=str(result.get("statusText", "STATUS TEXT NOT AVAILABLE")),
            redirected=bool(result.get("redirected", False)),
            type=result.get("type", False),
            duration=duration,
            end_time=end_epoch,
        )
        return resp_model

    @property
    def origin(self) -> str:
        """Return ``scheme://netloc`` of the current page URL."""
        url_parts = urlsplit(self.url)
        return f"{url_parts.scheme}://{url_parts.netloc}"

    async def cookies(self) -> List[Cookie]:
        """BrowserContext.cookies

        Cookies for the current page URL. Alias for `page.context.cookies([page.url])`.

        Returns
        -------
        List[{
            name: str,
            value: str,
            domain: str,
            path: str,
            expires: float,
            httpOnly: bool,
            secure: bool,
            sameSite: Union["Lax", "None", "Strict"],
            partitionKey: Union[str, None]
        }]
        """
        return await self.context.cookies([self.url])

    async def local_storage(self, **kwargs) -> dict[str, str]:
        """
        Return the localStorage entries for this page's origin.

        Delegates to ``self.context.local_storage(**kwargs)`` and picks the entry
        for :attr:`origin`; an empty dict is returned when the origin is absent.
        """
        ls = await self.context.local_storage(**kwargs)
        return ls.get(self.origin, {})

    async def session_storage(self) -> dict[str, str]:
        """
        Return the page's ``sessionStorage`` contents as evaluated in the page.

        An empty dict is returned when ``sessionStorage`` is missing or access to
        it throws, so the result always honours the declared return type.
        """
        storage = await self.evaluate(
            """
            (which) => {
                try {
                    const s = (which in window) ? window[which] : null;
                    if (!s) return null;
                    return s;
                } catch (_) {
                    return null;
                }
            }
            """,
            "sessionStorage",
        )
        # The script yields null when storage is unavailable; normalise to {}.
        return storage or {}

    async def json(self) -> list | dict:
        """
        Parse the page content as JSON and return the decoded value.

        Browsers wrap a raw JSON document in ``body > pre``; the text of that
        node is what gets parsed.

        Raises
        ------
        RuntimeError
            If the page has no ``body > pre`` node.
        json.JSONDecodeError
            If the node's text is not valid JSON.
        """
        body = await self.content()
        tree = HTMLParser(body)

        node = tree.css_first("body > pre")  # exact selector: direct child only
        if node is None:
            raise RuntimeError("Содержимое страницы не является json-контейнером")

        return json.loads(node.text())

    def __repr__(self) -> str:
        return f"<HumanPage wrapping {super().__repr__()!r}>"

Coverage for human_requests / human_page.py: 68%

161 statements