Coverage for human_requests/human_page.py: 72%

176 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-05-28 00:39 +0000

1from __future__ import annotations 

2 

3import base64 

4import json 

5import time 

6from dataclasses import dataclass 

7from pathlib import Path 

8from typing import TYPE_CHECKING, Any, Awaitable, Callable, List, Literal, Optional, cast 

9from urllib.parse import urlsplit 

10 

11from playwright.async_api import Cookie, Page 

12from playwright.async_api import Response as PWResponse 

13from playwright.async_api import TimeoutError as PlaywrightTimeoutError 

14from selectolax.parser import HTMLParser 

15from typing_extensions import override 

16 

17from .abstraction.http import URL, HttpMethod 

18from .abstraction.json_debug import loads_json_debug 

19from .abstraction.request import FetchRequest 

20from .abstraction.response import FetchResponse 

21from .tools import auto_wrap_methods, make_screenshot 

22 

23if TYPE_CHECKING: 

24 from .human_context import HumanContext 

25 

26 

27@dataclass 

28@auto_wrap_methods(decorator=make_screenshot) 

29class HumanPage(Page): 

30 """ 

31 A thin, type-compatible wrapper over Playwright's Page. 

32 """ 

33 

34 on_error_screenshot_path: str = "" 

35 # ---------- core identity ---------- 

36 

37 @property 

38 @override 

39 def context(self) -> "HumanContext": 

40 # рантайм остаётся прежним; только уточняем тип 

41 return cast("HumanContext", super().context) 

42 

43 @staticmethod 

44 def replace(playwright_page: Page) -> HumanPage: 

45 """Подменяет стандартный Playwright класс с сохранением содержимого.""" 

46 from .human_context import HumanContext # avoid circular import 

47 

48 if isinstance(playwright_page.context, HumanContext) is False: 

49 raise TypeError("The provided Page's context is not a HumanContext") 

50 

51 playwright_page.__class__ = HumanPage 

52 return playwright_page # type: ignore[return-value] 

53 

54 # ---------- lifecycle / sync ---------- 

55 

56 @override 

57 async def goto( 

58 self, 

59 url: str, 

60 *, 

61 retry: Optional[int] = None, 

62 on_retry: Optional[Callable[[], Awaitable[None]]] = None, 

63 # standard Playwright kwargs (not exhaustive; forwarded via **kwargs): 

64 **kwargs: Any, 

65 ) -> Optional[PWResponse]: 

66 """ 

67 Navigate to `url` with optional retry-on-timeout. 

68 

69 If the initial navigation raises a Playwright `TimeoutError`, this method performs up to 

70 `retry` *soft* reloads (`Page.reload`) using the same `wait_until`/`timeout` settings. 

71 Before each retry, the optional `on_retry` hook is awaited so you can (re)attach 

72 one-shot listeners, route handlers, subscriptions, etc., that would otherwise be spent. 

73 

74 Parameters 

75 ---------- 

76 url : str 

77 Absolute URL to navigate to. 

78 retry : int | None, optional 

79 Number of soft reload attempts after a timeout (0 means no retries). 

80 If None, defaults to `session.page_retry`. 

81 on_retry : Callable[[], Awaitable[None]] | None, optional 

82 Async hook called before each retry; use it to re-register any one-shot 

83 event handlers or routes needed for the next attempt. 

84 timeout : float | None, optional 

85 Navigation timeout in milliseconds. If None, falls back to `session.timeout * 1000`. 

86 wait_until : {"commit", "domcontentloaded", "load", "networkidle"} | None, optional 

87 When to consider the navigation successful (forwarded to Playwright). 

88 referer : str | None, optional 

89 Per-request `Referer` header 

90 (overrides headers set via `page.set_extra_http_headers()`). 

91 **kwargs : Any 

92 Any additional keyword arguments are forwarded to Playwright's `Page.goto`. 

93 

94 Returns 

95 ------- 

96 playwright.async_api.Response | None 

97 The main resource `Response`, or `None` for `about:blank` and same-URL hash navigations. 

98 

99 Raises 

100 ------ 

101 playwright.async_api.TimeoutError 

102 If the initial navigation and all retries time out. 

103 Any other exceptions from `Page.goto` / `Page.reload` may also propagate. 

104 

105 Notes 

106 ----- 

107 - Soft reloads reuse the same `wait_until`/`timeout` pair to keep behavior consistent 

108 across attempts. 

109 - Because one-shot handlers are consumed after a failed attempt, always re-attach them 

110 inside `on_retry` if the navigation logic depends on them. 

111 """ 

112 # Build the kwargs for the underlying goto/reload calls: 

113 try: 

114 return await super().goto(url, **kwargs) 

115 except PlaywrightTimeoutError as last_err: 

116 attempts_left = ( 

117 int(retry) + 1 if retry is not None else 1 

118 ) # +1 т.к. первый запрос базис 

119 while attempts_left > 0: 

120 attempts_left -= 1 

121 if on_retry is not None: 

122 await on_retry() 

123 try: 

124 # Soft refresh with the SAME wait_until/timeout 

125 await super().reload( 

126 **{k: kwargs[k] for k in ("wait_until", "timeout") if k in kwargs} 

127 ) 

128 last_err = None 

129 break 

130 except PlaywrightTimeoutError as e: 

131 last_err = e 

132 if last_err is not None: 

133 raise last_err 

134 

135 async def goto_render(self, first, /, **goto_kwargs) -> Optional[PWResponse]: 

136 """ 

137 Перехватывает первый навигационный запрос main-frame к target_url и 

138 отдаёт синтетический ответ, затем делает обычный page.goto(...). 

139 Возвращает Optional[PWResponse] как и goto. 

140 """ 

141 

142 # -------- helpers (локально и коротко) --------------------------------- 

143 def _to_bytes(data: bytes | bytearray | memoryview | str) -> bytes: 

144 return ( 

145 data 

146 if isinstance(data, bytes) 

147 else ( 

148 bytes(data) 

149 if isinstance(data, (bytearray, memoryview)) 

150 else data.encode("utf-8", "replace") 

151 ) 

152 ) 

153 

154 def _is_html(b: bytes) -> bool: 

155 s = b[:512].lstrip().lower() 

156 return s.startswith(b"<!doctype html") or s.startswith(b"<html") or b"<body" in s 

157 

158 def _norm_args() -> tuple[str, bytes, int, dict[str, str]]: 

159 if isinstance(first, FetchResponse): 

160 url = first.url.full_url 

161 body = _to_bytes(first.raw or b"") 

162 code = int(first.status_code) 

163 hdrs = dict(first.headers or {}) 

164 else: 

165 url = str(first) 

166 if "body" not in goto_kwargs: 

167 raise TypeError("goto_render(url=..., *, body=...) is required") 

168 body = _to_bytes(goto_kwargs.pop("body")) 

169 code = int(goto_kwargs.pop("status_code", 200)) 

170 hdrs = dict(goto_kwargs.pop("headers", {}) or {}) 

171 # убрать транспортные, поставить content-type при html 

172 drop = {"content-length", "content-encoding", "transfer-encoding", "connection"} 

173 clean = {k: v for k, v in hdrs.items() if k.lower() not in drop} 

174 if body and not any(k.lower() == "content-type" for k in clean) and _is_html(body): 

175 clean["content-type"] = "text/html; charset=utf-8" 

176 return url, body, code, clean 

177 

178 # Переназначим ретраи до того, как их прочитает goto 

179 retry = goto_kwargs.pop("retry", None) 

180 on_retry = goto_kwargs.pop("on_retry", None) 

181 

182 target_url, raw, status_code, headers = _norm_args() 

183 page = self 

184 main_frame = page.main_frame 

185 target_wo_hash = urlsplit(target_url)._replace(fragment="").geturl() 

186 

187 handled = False 

188 installed = False 

189 

190 def _match(req) -> bool: 

191 if ( 

192 req.frame is not main_frame 

193 or not req.is_navigation_request() 

194 or req.resource_type != "document" 

195 ): 

196 return False 

197 return urlsplit(req.url)._replace(fragment="").geturl() == target_wo_hash 

198 

199 async def handler(route, request): 

200 nonlocal handled, installed 

201 if handled or not _match(request): 

202 return await route.continue_() 

203 handled = True 

204 await route.fulfill(status=status_code, headers=headers, body=raw) 

205 # Снимем маршрут сразу; если упадёт — не скрываем: пусть всплывёт позже. 

206 await page.unroute(target_url, handler) 

207 installed = False 

208 

209 async def _install(): 

210 nonlocal installed 

211 if installed: 

212 await page.unroute(target_url, handler) 

213 await page.route(target_url, handler) 

214 installed = True 

215 

216 await _install() 

217 

218 async def _on_retry_wrapper(): 

219 await _install() 

220 if on_retry: 

221 await on_retry() 

222 

223 # НИЧЕГО не прячем: если goto упадёт, а затем ещё и unroute упадёт 

224 # — поднимем обе ошибки как группу 

225 nav_exc: Exception | None = None 

226 res: Optional[PWResponse] = None 

227 try: 

228 res = await page.goto( 

229 target_url, retry=retry, on_retry=_on_retry_wrapper, **goto_kwargs 

230 ) 

231 except Exception as e: 

232 nav_exc = e 

233 finally: 

234 unroute_exc: Exception | None = None 

235 if installed: 

236 try: 

237 await page.unroute(target_url, handler) 

238 except Exception as e: 

239 unroute_exc = e 

240 if nav_exc and unroute_exc: 

241 raise ExceptionGroup("goto_render failed", (nav_exc, unroute_exc)) 

242 if nav_exc: 

243 raise nav_exc 

244 if unroute_exc: 

245 raise unroute_exc 

246 

247 return res 

248 

249 async def fetch( 

250 self, 

251 url: str, 

252 *, 

253 method: HttpMethod = HttpMethod.GET, 

254 headers: Optional[dict[str, str]] = None, 

255 body: Optional[str | list | dict] = None, 

256 credentials: Literal["omit", "same-origin", "include"] = "include", 

257 mode: Literal["cors", "no-cors", "same-origin"] = "cors", 

258 redirect: Literal["follow", "error", "manual"] = "follow", 

259 referrer: Optional[str] = None, 

260 timeout_ms: int = 30000, 

261 retry: int = 2, 

262 ) -> FetchResponse: 

263 """ 

264 Тонкая прослойка над JS fetch: выполняет запрос внутри страницы и возвращает ResponseModel. 

265 • Без route / wait_for_event. 

266 • raw — ВСЕГДА распакованные байты (если тело доступно JS). 

267 • При opaque-ответе тело/заголовки могут быть недоступны — это ограничение CORS. 

268 • `retry` повторяет запрос только при timeout (AbortController по timeout_ms). 

269 """ 

270 if retry < 0: 

271 raise ValueError("retry must be >= 0") 

272 

273 declared_headers = {k.lower(): v for k, v in (headers or {}).items()} 

274 js_headers = {k: v for k, v in declared_headers.items() if k != "referer"} 

275 js_ref = referrer or declared_headers.get("referer") 

276 

277 js_body: Any = body 

278 if isinstance(body, (dict, list)): 

279 js_body = json.dumps(body, ensure_ascii=False) 

280 js_headers["content-type"] = "application/json" 

281 

282 start_t = time.perf_counter() 

283 

284 _JS_PATH = Path(__file__).parent / "fetch.js" 

285 JS_FETCH = _JS_PATH.read_text(encoding="utf-8") 

286 

287 eval_payload = dict( 

288 url=url, 

289 method=method.value, 

290 headers=js_headers or {}, 

291 body=js_body, 

292 credentials=credentials, 

293 mode=mode, 

294 redirect=redirect, 

295 ref=js_ref, 

296 timeoutMs=timeout_ms, 

297 ) 

298 

299 attempts_left = retry 

300 result: Any 

301 while True: 

302 result = await self.evaluate(JS_FETCH, eval_payload) 

303 if result.get("ok"): 

304 break 

305 if result.get("isTimeout") and attempts_left > 0: 

306 attempts_left -= 1 

307 continue 

308 raise RuntimeError(f"fetch failed: {result.get('error')}") 

309 

310 # bytes в raw: распакованные (если body доступен) 

311 b64 = result.get("bodyB64") 

312 raw = base64.b64decode(b64) if isinstance(b64, str) else b"" 

313 

314 # Нормализуем заголовки: если raw есть, 

315 # уберём transport-атрибуты, чтобы не путать потребителя 

316 resp_headers = {k.lower(): v for k, v in (result.get("headers") or {}).items()} 

317 if raw: 

318 resp_headers.pop("content-encoding", None) 

319 resp_headers.pop("content-length", None) 

320 

321 req_model = FetchRequest( 

322 page=self, 

323 method=method, 

324 url=URL(full_url=url), 

325 headers=declared_headers, 

326 body=body, 

327 ) 

328 

329 duration = time.perf_counter() - start_t 

330 end_epoch = time.time() 

331 

332 resp_model = FetchResponse( 

333 page=self, 

334 request=req_model, 

335 url=URL(full_url=result.get("finalUrl") or url), 

336 headers=resp_headers, 

337 raw=raw, # всегда bytes; пусто если CORS не дал читать тело 

338 status_code=int(result.get("status", 0)), 

339 status_text=str(result.get("statusText", "STATUS TEXT NOT AVAILABLE")), 

340 redirected=bool(result.get("redirected", False)), 

341 type=result.get("type", False), 

342 duration=duration, 

343 end_time=end_epoch, 

344 ) 

345 return resp_model 

346 

347 @property 

348 def origin(self) -> str: 

349 url_parts = urlsplit(self.url) 

350 return f"{url_parts.scheme}://{url_parts.netloc}" 

351 

352 async def cookies(self) -> List[Cookie]: 

353 """BrowserContext.cookies 

354 

355 Cookies for the current page URL. Alias for `page.context.cookies([page.url])`. 

356 

357 Returns 

358 ------- 

359 List[{ 

360 name: str, 

361 value: str, 

362 domain: str, 

363 path: str, 

364 expires: float, 

365 httpOnly: bool, 

366 secure: bool, 

367 sameSite: Union["Lax", "None", "Strict"], 

368 partitionKey: Union[str, None] 

369 }] 

370 """ 

371 return await self.context.cookies([self.url]) 

372 

373 async def local_storage(self, **kwargs) -> dict[str, str]: 

374 ls = await self.context.local_storage(**kwargs) 

375 return ls.get(self.origin, {}) 

376 

377 async def session_storage(self) -> dict[str, str]: 

378 return await self.evaluate( 

379 """ 

380 (which) => { 

381 try { 

382 const s = (which in window) ? window[which] : null; 

383 if (!s) return null; 

384 return s; 

385 } catch (_) { 

386 return null; 

387 } 

388 } 

389 """, 

390 "sessionStorage", 

391 ) 

392 

393 async def json(self) -> list | dict: 

394 """ 

395 Если контент страницы это json - парсит (браузер всегда оборачивает его в body->pre), 

396 сереализует и выдает его. 

397 """ 

398 body = await self.content() 

399 tree = HTMLParser(body) 

400 

401 node = tree.css_first("body > pre") # точный селектор "body > pre" 

402 if node is None: 

403 raise RuntimeError("Содержимое страницы не является json-контейнером") 

404 

405 return loads_json_debug(node.text()) 

406 

407 def __repr__(self) -> str: 

408 return f"<HumanPage wrapping {super().__repr__()!r}>"