Coverage for human_requests / human_page.py: 71%

175 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-07 17:38 +0000

1from __future__ import annotations 

2 

3import base64 

4import json 

5import time 

6from dataclasses import dataclass 

7from pathlib import Path 

8from typing import TYPE_CHECKING, Any, Awaitable, Callable, List, Literal, Optional, cast 

9from urllib.parse import urlsplit 

10 

11from playwright.async_api import Cookie, Page 

12from playwright.async_api import Response as PWResponse 

13from playwright.async_api import TimeoutError as PlaywrightTimeoutError 

14from selectolax.parser import HTMLParser 

15from typing_extensions import override 

16 

17from .abstraction.http import URL, HttpMethod 

18from .abstraction.request import FetchRequest 

19from .abstraction.response import FetchResponse 

20from .tools import auto_wrap_methods, make_screenshot 

21 

22if TYPE_CHECKING: 

23 from .human_context import HumanContext 

24 

25 

26@dataclass 

27@auto_wrap_methods(decorator=make_screenshot) 

28class HumanPage(Page): 

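    """
    A thin, type-compatible wrapper over Playwright's Page.

    An existing Playwright ``Page`` is converted in place by ``HumanPage.replace``,
    which swaps the object's ``__class__``. On top of the stock ``Page`` API this
    class provides ``goto`` with retry-on-timeout, ``goto_render``, an in-page
    ``fetch`` and cookie / storage helpers.

    NOTE(review): ``@dataclass`` derives ``__init__`` and ``__eq__`` from the single
    field ``on_error_screenshot_path``; confirm that field-based equality (and the
    resulting loss of default hashing) is intended for page objects.
    """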

32 

33 on_error_screenshot_path: str = "" 

34 # ---------- core identity ---------- 

35 

36 @property 

37 @override 

38 def context(self) -> "HumanContext": 

39 # рантайм остаётся прежним; только уточняем тип 

40 return cast("HumanContext", super().context) 

41 

42 @staticmethod 

43 def replace(playwright_page: Page) -> HumanPage: 

44 """Подменяет стандартный Playwright класс с сохранением содержимого.""" 

45 from .human_context import HumanContext # avoid circular import 

46 

47 if isinstance(playwright_page.context, HumanContext) is False: 

48 raise TypeError("The provided Page's context is not a HumanContext") 

49 

50 playwright_page.__class__ = HumanPage 

51 return playwright_page # type: ignore[return-value] 

52 

53 # ---------- lifecycle / sync ---------- 

54 

55 @override 

56 async def goto( 

57 self, 

58 url: str, 

59 *, 

60 retry: Optional[int] = None, 

61 on_retry: Optional[Callable[[], Awaitable[None]]] = None, 

62 # standard Playwright kwargs (not exhaustive; forwarded via **kwargs): 

63 **kwargs: Any, 

64 ) -> Optional[PWResponse]: 

65 """ 

66 Navigate to `url` with optional retry-on-timeout. 

67 

68 If the initial navigation raises a Playwright `TimeoutError`, this method performs up to 

69 `retry` *soft* reloads (`Page.reload`) using the same `wait_until`/`timeout` settings. 

70 Before each retry, the optional `on_retry` hook is awaited so you can (re)attach 

71 one-shot listeners, route handlers, subscriptions, etc., that would otherwise be spent. 

72 

73 Parameters 

74 ---------- 

75 url : str 

76 Absolute URL to navigate to. 

77 retry : int | None, optional 

78 Number of soft reload attempts after a timeout (0 means no retries). 

79 If None, defaults to `session.page_retry`. 

80 on_retry : Callable[[], Awaitable[None]] | None, optional 

81 Async hook called before each retry; use it to re-register any one-shot 

82 event handlers or routes needed for the next attempt. 

83 timeout : float | None, optional 

84 Navigation timeout in milliseconds. If None, falls back to `session.timeout * 1000`. 

85 wait_until : {"commit", "domcontentloaded", "load", "networkidle"} | None, optional 

86 When to consider the navigation successful (forwarded to Playwright). 

87 referer : str | None, optional 

88 Per-request `Referer` header 

89 (overrides headers set via `page.set_extra_http_headers()`). 

90 **kwargs : Any 

91 Any additional keyword arguments are forwarded to Playwright's `Page.goto`. 

92 

93 Returns 

94 ------- 

95 playwright.async_api.Response | None 

96 The main resource `Response`, or `None` for `about:blank` and same-URL hash navigations. 

97 

98 Raises 

99 ------ 

100 playwright.async_api.TimeoutError 

101 If the initial navigation and all retries time out. 

102 Any other exceptions from `Page.goto` / `Page.reload` may also propagate. 

103 

104 Notes 

105 ----- 

106 - Soft reloads reuse the same `wait_until`/`timeout` pair to keep behavior consistent 

107 across attempts. 

108 - Because one-shot handlers are consumed after a failed attempt, always re-attach them 

109 inside `on_retry` if the navigation logic depends on them. 

110 """ 

111 # Build the kwargs for the underlying goto/reload calls: 

112 try: 

113 return await super().goto(url, **kwargs) 

114 except PlaywrightTimeoutError as last_err: 

115 attempts_left = ( 

116 int(retry) + 1 if retry is not None else 1 

117 ) # +1 т.к. первый запрос базис 

118 while attempts_left > 0: 

119 attempts_left -= 1 

120 if on_retry is not None: 

121 await on_retry() 

122 try: 

123 # Soft refresh with the SAME wait_until/timeout 

124 await super().reload( 

125 **{k: kwargs[k] for k in ("wait_until", "timeout") if k in kwargs} 

126 ) 

127 last_err = None 

128 break 

129 except PlaywrightTimeoutError as e: 

130 last_err = e 

131 if last_err is not None: 

132 raise last_err 

133 

134 async def goto_render(self, first, /, **goto_kwargs) -> Optional[PWResponse]: 

135 """ 

136 Перехватывает первый навигационный запрос main-frame к target_url и 

137 отдаёт синтетический ответ, затем делает обычный page.goto(...). 

138 Возвращает Optional[PWResponse] как и goto. 

139 """ 

140 

141 # -------- helpers (локально и коротко) --------------------------------- 

142 def _to_bytes(data: bytes | bytearray | memoryview | str) -> bytes: 

143 return ( 

144 data 

145 if isinstance(data, bytes) 

146 else ( 

147 bytes(data) 

148 if isinstance(data, (bytearray, memoryview)) 

149 else data.encode("utf-8", "replace") 

150 ) 

151 ) 

152 

153 def _is_html(b: bytes) -> bool: 

154 s = b[:512].lstrip().lower() 

155 return s.startswith(b"<!doctype html") or s.startswith(b"<html") or b"<body" in s 

156 

157 def _norm_args() -> tuple[str, bytes, int, dict[str, str]]: 

158 if isinstance(first, FetchResponse): 

159 url = first.url.full_url 

160 body = _to_bytes(first.raw or b"") 

161 code = int(first.status_code) 

162 hdrs = dict(first.headers or {}) 

163 else: 

164 url = str(first) 

165 if "body" not in goto_kwargs: 

166 raise TypeError("goto_render(url=..., *, body=...) is required") 

167 body = _to_bytes(goto_kwargs.pop("body")) 

168 code = int(goto_kwargs.pop("status_code", 200)) 

169 hdrs = dict(goto_kwargs.pop("headers", {}) or {}) 

170 # убрать транспортные, поставить content-type при html 

171 drop = {"content-length", "content-encoding", "transfer-encoding", "connection"} 

172 clean = {k: v for k, v in hdrs.items() if k.lower() not in drop} 

173 if body and not any(k.lower() == "content-type" for k in clean) and _is_html(body): 

174 clean["content-type"] = "text/html; charset=utf-8" 

175 return url, body, code, clean 

176 

177 # Переназначим ретраи до того, как их прочитает goto 

178 retry = goto_kwargs.pop("retry", None) 

179 on_retry = goto_kwargs.pop("on_retry", None) 

180 

181 target_url, raw, status_code, headers = _norm_args() 

182 page = self 

183 main_frame = page.main_frame 

184 target_wo_hash = urlsplit(target_url)._replace(fragment="").geturl() 

185 

186 handled = False 

187 installed = False 

188 

189 def _match(req) -> bool: 

190 if ( 

191 req.frame is not main_frame 

192 or not req.is_navigation_request() 

193 or req.resource_type != "document" 

194 ): 

195 return False 

196 return urlsplit(req.url)._replace(fragment="").geturl() == target_wo_hash 

197 

198 async def handler(route, request): 

199 nonlocal handled, installed 

200 if handled or not _match(request): 

201 return await route.continue_() 

202 handled = True 

203 await route.fulfill(status=status_code, headers=headers, body=raw) 

204 # Снимем маршрут сразу; если упадёт — не скрываем: пусть всплывёт позже. 

205 await page.unroute(target_url, handler) 

206 installed = False 

207 

208 async def _install(): 

209 nonlocal installed 

210 if installed: 

211 await page.unroute(target_url, handler) 

212 await page.route(target_url, handler) 

213 installed = True 

214 

215 await _install() 

216 

217 async def _on_retry_wrapper(): 

218 await _install() 

219 if on_retry: 

220 await on_retry() 

221 

222 # НИЧЕГО не прячем: если goto упадёт, а затем ещё и unroute упадёт 

223 # — поднимем обе ошибки как группу 

224 nav_exc: Exception | None = None 

225 res: Optional[PWResponse] = None 

226 try: 

227 res = await page.goto( 

228 target_url, retry=retry, on_retry=_on_retry_wrapper, **goto_kwargs 

229 ) 

230 except Exception as e: 

231 nav_exc = e 

232 finally: 

233 unroute_exc: Exception | None = None 

234 if installed: 

235 try: 

236 await page.unroute(target_url, handler) 

237 except Exception as e: 

238 unroute_exc = e 

239 if nav_exc and unroute_exc: 

240 raise ExceptionGroup("goto_render failed", (nav_exc, unroute_exc)) 

241 if nav_exc: 

242 raise nav_exc 

243 if unroute_exc: 

244 raise unroute_exc 

245 

246 return res 

247 

248 async def fetch( 

249 self, 

250 url: str, 

251 *, 

252 method: HttpMethod = HttpMethod.GET, 

253 headers: Optional[dict[str, str]] = None, 

254 body: Optional[str | list | dict] = None, 

255 credentials: Literal["omit", "same-origin", "include"] = "include", 

256 mode: Literal["cors", "no-cors", "same-origin"] = "cors", 

257 redirect: Literal["follow", "error", "manual"] = "follow", 

258 referrer: Optional[str] = None, 

259 timeout_ms: int = 30000, 

260 retry: int = 2, 

261 ) -> FetchResponse: 

262 """ 

263 Тонкая прослойка над JS fetch: выполняет запрос внутри страницы и возвращает ResponseModel. 

264 • Без route / wait_for_event. 

265 • raw — ВСЕГДА распакованные байты (если тело доступно JS). 

266 • При opaque-ответе тело/заголовки могут быть недоступны — это ограничение CORS. 

267 • `retry` повторяет запрос только при timeout (AbortController по timeout_ms). 

268 """ 

269 if retry < 0: 

270 raise ValueError("retry must be >= 0") 

271 

272 declared_headers = {k.lower(): v for k, v in (headers or {}).items()} 

273 js_headers = {k: v for k, v in declared_headers.items() if k != "referer"} 

274 js_ref = referrer or declared_headers.get("referer") 

275 

276 js_body: Any = body 

277 if isinstance(body, (dict, list)): 

278 js_body = json.dumps(body, ensure_ascii=False) 

279 js_headers["content-type"] = "application/json" 

280 

281 start_t = time.perf_counter() 

282 

283 _JS_PATH = Path(__file__).parent / "fetch.js" 

284 JS_FETCH = _JS_PATH.read_text(encoding="utf-8") 

285 

286 eval_payload = dict( 

287 url=url, 

288 method=method.value, 

289 headers=js_headers or {}, 

290 body=js_body, 

291 credentials=credentials, 

292 mode=mode, 

293 redirect=redirect, 

294 ref=js_ref, 

295 timeoutMs=timeout_ms, 

296 ) 

297 

298 attempts_left = retry 

299 result: Any 

300 while True: 

301 result = await self.evaluate(JS_FETCH, eval_payload) 

302 if result.get("ok"): 

303 break 

304 if result.get("isTimeout") and attempts_left > 0: 

305 attempts_left -= 1 

306 continue 

307 raise RuntimeError(f"fetch failed: {result.get('error')}") 

308 

309 # bytes в raw: распакованные (если body доступен) 

310 b64 = result.get("bodyB64") 

311 raw = base64.b64decode(b64) if isinstance(b64, str) else b"" 

312 

313 # Нормализуем заголовки: если raw есть, 

314 # уберём transport-атрибуты, чтобы не путать потребителя 

315 resp_headers = {k.lower(): v for k, v in (result.get("headers") or {}).items()} 

316 if raw: 

317 resp_headers.pop("content-encoding", None) 

318 resp_headers.pop("content-length", None) 

319 

320 req_model = FetchRequest( 

321 page=self, 

322 method=method, 

323 url=URL(full_url=url), 

324 headers=declared_headers, 

325 body=body, 

326 ) 

327 

328 duration = time.perf_counter() - start_t 

329 end_epoch = time.time() 

330 

331 resp_model = FetchResponse( 

332 page=self, 

333 request=req_model, 

334 url=URL(full_url=result.get("finalUrl") or url), 

335 headers=resp_headers, 

336 raw=raw, # всегда bytes; пусто если CORS не дал читать тело 

337 status_code=int(result.get("status", 0)), 

338 status_text=str(result.get("statusText", "STATUS TEXT NOT AVAILABLE")), 

339 redirected=bool(result.get("redirected", False)), 

340 type=result.get("type", False), 

341 duration=duration, 

342 end_time=end_epoch, 

343 ) 

344 return resp_model 

345 

346 @property 

347 def origin(self) -> str: 

348 url_parts = urlsplit(self.url) 

349 return f"{url_parts.scheme}://{url_parts.netloc}" 

350 

351 async def cookies(self) -> List[Cookie]: 

352 """BrowserContext.cookies 

353 

354 Cookies for the current page URL. Alias for `page.context.cookies([page.url])`. 

355 

356 Returns 

357 ------- 

358 List[{ 

359 name: str, 

360 value: str, 

361 domain: str, 

362 path: str, 

363 expires: float, 

364 httpOnly: bool, 

365 secure: bool, 

366 sameSite: Union["Lax", "None", "Strict"], 

367 partitionKey: Union[str, None] 

368 }] 

369 """ 

370 return await self.context.cookies([self.url]) 

371 

372 async def local_storage(self, **kwargs) -> dict[str, str]: 

373 ls = await self.context.local_storage(**kwargs) 

374 return ls.get(self.origin, {}) 

375 

376 async def session_storage(self) -> dict[str, str]: 

377 return await self.evaluate( 

378 """ 

379 (which) => { 

380 try { 

381 const s = (which in window) ? window[which] : null; 

382 if (!s) return null; 

383 return s; 

384 } catch (_) { 

385 return null; 

386 } 

387 } 

388 """, 

389 "sessionStorage", 

390 ) 

391 

392 async def json(self) -> list | dict: 

393 """ 

394 Если контент страницы это json - парсит (браузер всегда оборачивает его в body->pre), 

395 сереализует и выдает его. 

396 """ 

397 body = await self.content() 

398 tree = HTMLParser(body) 

399 

400 node = tree.css_first("body > pre") # точный селектор "body > pre" 

401 if node is None: 

402 raise RuntimeError("Содержимое страницы не является json-контейнером") 

403 

404 return json.loads(node.text()) 

405 

406 def __repr__(self) -> str: 

407 return f"<HumanPage wrapping {super().__repr__()!r}>"