Coverage for human_requests / human_page.py: 68%

161 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-11-25 10:02 +0000

1from __future__ import annotations 

2 

3import base64 

4import json 

5import time 

6from pathlib import Path 

7from typing import TYPE_CHECKING, Any, Awaitable, Callable, List, Literal, Optional, cast 

8from urllib.parse import urlsplit 

9 

10from playwright.async_api import Cookie, Page 

11from playwright.async_api import Response as PWResponse 

12from playwright.async_api import TimeoutError as PlaywrightTimeoutError 

13from selectolax.parser import HTMLParser 

14from typing_extensions import override 

15 

16from .abstraction.http import URL, HttpMethod 

17from .abstraction.request import FetchRequest 

18from .abstraction.response import FetchResponse 

19 

20if TYPE_CHECKING: 

21 from .human_context import HumanContext 

22 

23 

class HumanPage(Page):
    """
    A thin, type-compatible wrapper over Playwright's Page.

    Instances are not constructed directly: `HumanPage.replace` re-classes an
    existing Playwright `Page` in place. On top of the regular `Page` API the
    class adds retry-aware navigation (`goto`), navigation that renders a
    synthetic response (`goto_render`), an in-page JS `fetch` bridge, and small
    helpers for cookies, storage and JSON pages.
    """

    # Text of the bundled ``fetch.js``; loaded lazily by `fetch` and shared by
    # every page so the file is read from disk only once per process.
    _fetch_js_source: Optional[str] = None

    # ---------- core identity ----------

    @property
    @override
    def context(self) -> "HumanContext":
        """Return the owning browser context, typed as `HumanContext`."""
        # Runtime behaviour is unchanged; the cast only narrows the static type.
        return cast("HumanContext", super().context)

    @staticmethod
    def replace(playwright_page: Page) -> HumanPage:
        """
        Swap the class of a standard Playwright page for `HumanPage`, keeping its state.

        The object is mutated in place (its `__class__` is reassigned) and the
        very same object is returned.

        Raises
        ------
        TypeError
            If the page's context is not a `HumanContext`.
        """
        from .human_context import HumanContext  # avoid circular import

        if not isinstance(playwright_page.context, HumanContext):
            raise TypeError("The provided Page's context is not a HumanContext")

        playwright_page.__class__ = HumanPage
        return playwright_page  # type: ignore[return-value]

    # ---------- lifecycle / sync ----------

    @override
    async def goto(
        self,
        url: str,
        *,
        retry: Optional[int] = None,
        on_retry: Optional[Callable[[], Awaitable[None]]] = None,
        # standard Playwright kwargs (not exhaustive; forwarded via **kwargs):
        **kwargs: Any,
    ) -> Optional[PWResponse]:
        """
        Navigate to `url` with optional retry-on-timeout.

        If the initial navigation raises a Playwright `TimeoutError`, this method
        performs *soft* reloads (`Page.reload`) using the same `wait_until`/`timeout`
        settings until one of them succeeds or the attempts run out. Before each
        reload, the optional `on_retry` hook is awaited so you can (re)attach
        one-shot listeners, route handlers, subscriptions, etc., that would
        otherwise be spent.

        Parameters
        ----------
        url : str
            URL to navigate to.
        retry : int | None, optional
            Controls the number of soft reload attempts after a timeout.
            When None, a single reload is attempted.
            NOTE(review): the reload loop runs `retry + 1` times, so `retry=0`
            still performs one reload - confirm whether the extra attempt is intended.
        on_retry : Callable[[], Awaitable[None]] | None, optional
            Async hook called before each reload; use it to re-register any one-shot
            event handlers or routes needed for the next attempt.
        **kwargs : Any
            Forwarded unchanged to Playwright's `Page.goto` (e.g. `timeout`,
            `wait_until`, `referer`). Only `wait_until` and `timeout` are reused
            for the reloads.

        Returns
        -------
        playwright.async_api.Response | None
            Whatever `Page.goto` returned, or - after a timeout - whatever the
            first successful `Page.reload` returned.

        Raises
        ------
        playwright.async_api.TimeoutError
            If the initial navigation and all reloads time out (the last timeout
            is re-raised).
        Any other exceptions from `Page.goto` / `Page.reload` / `on_retry` propagate.
        """
        try:
            return await super().goto(url, **kwargs)
        except PlaywrightTimeoutError as first_err:
            last_err: PlaywrightTimeoutError = first_err
            # Soft refresh reuses the SAME wait_until/timeout as the first attempt.
            reload_kwargs = {
                key: kwargs[key] for key in ("wait_until", "timeout") if key in kwargs
            }
            attempts = int(retry) + 1 if retry is not None else 1
            for _ in range(attempts):
                if on_retry is not None:
                    await on_retry()
                try:
                    # Hand the reload's response back to the caller instead of
                    # dropping it (a recovered navigation used to yield None).
                    return await super().reload(**reload_kwargs)
                except PlaywrightTimeoutError as err:
                    last_err = err
            raise last_err

    async def goto_render(self, first: Any, /, **goto_kwargs: Any) -> Optional[PWResponse]:
        """
        Navigate to a URL while serving a synthetic response for it.

        A route is installed that intercepts the main-frame document navigation
        request to the target URL and fulfills it with the supplied
        status/headers/body; then a regular `goto` is performed.

        Parameters
        ----------
        first : FetchResponse | str
            Either a `FetchResponse` (its URL, raw body, status code and headers
            are replayed) or a URL; in the latter case `body` must be passed as a
            keyword, with optional `status_code` (default 200) and `headers`.
        **goto_kwargs : Any
            `body`, `status_code`, `headers` (URL form only), `retry`, `on_retry`;
            everything else is forwarded to `goto`.

        Returns
        -------
        playwright.async_api.Response | None
            Same as `goto`.

        Raises
        ------
        TypeError
            If `first` is not a `FetchResponse` and `body` is missing.
        ExceptionGroup
            If both the navigation and the final route removal fail.
        Otherwise the navigation error or the route-removal error is re-raised as is.
        """

        # -------- helpers (local and short) ------------------------------------
        def _to_bytes(data: bytes | bytearray | memoryview | str) -> bytes:
            if isinstance(data, bytes):
                return data
            if isinstance(data, (bytearray, memoryview)):
                return bytes(data)
            return data.encode("utf-8", "replace")

        def _is_html(b: bytes) -> bool:
            # Cheap sniff over the first 512 bytes only.
            head = b[:512].lstrip().lower()
            return head.startswith((b"<!doctype html", b"<html")) or b"<body" in head

        def _norm_args() -> tuple[str, bytes, int, dict[str, str]]:
            if isinstance(first, FetchResponse):
                url = first.url.full_url
                body = _to_bytes(first.raw or b"")
                code = int(first.status_code)
                hdrs = dict(first.headers or {})
            else:
                url = str(first)
                if "body" not in goto_kwargs:
                    raise TypeError("goto_render(url=..., *, body=...) is required")
                body = _to_bytes(goto_kwargs.pop("body"))
                code = int(goto_kwargs.pop("status_code", 200))
                hdrs = dict(goto_kwargs.pop("headers", {}) or {})
            # Drop transport-level headers; add a content-type when the body looks like HTML.
            drop = {"content-length", "content-encoding", "transfer-encoding", "connection"}
            clean = {k: v for k, v in hdrs.items() if k.lower() not in drop}
            if body and not any(k.lower() == "content-type" for k in clean) and _is_html(body):
                clean["content-type"] = "text/html; charset=utf-8"
            return url, body, code, clean

        # Take the retry settings out before `goto` sees the kwargs: the hook is
        # wrapped below so the route is re-installed ahead of every retry.
        retry = goto_kwargs.pop("retry", None)
        on_retry = goto_kwargs.pop("on_retry", None)

        target_url, raw, status_code, headers = _norm_args()
        page = self
        main_frame = page.main_frame
        target_wo_hash = urlsplit(target_url)._replace(fragment="").geturl()

        handled = False  # the synthetic response was already served for this attempt
        installed = False  # the route is currently registered on the page

        def _match(req: Any) -> bool:
            # Only the main-frame document navigation to the target URL qualifies;
            # URLs are compared with the fragment stripped.
            if (
                req.frame is not main_frame
                or not req.is_navigation_request()
                or req.resource_type != "document"
            ):
                return False
            return urlsplit(req.url)._replace(fragment="").geturl() == target_wo_hash

        async def handler(route: Any, request: Any) -> None:
            nonlocal handled, installed
            if handled or not _match(request):
                return await route.continue_()
            handled = True
            await route.fulfill(status=status_code, headers=headers, body=raw)
            # Remove the route right away; a failure here is not hidden and
            # leaves `installed` set so the final cleanup tries again.
            await page.unroute(target_url, handler)
            installed = False

        async def _install() -> None:
            nonlocal handled, installed
            if installed:
                await page.unroute(target_url, handler)
            # Re-arm the one-shot flag: without this a route re-installed for a
            # retry would pass the reload straight through to the network.
            handled = False
            await page.route(target_url, handler)
            installed = True

        await _install()

        async def _on_retry_wrapper() -> None:
            await _install()
            if on_retry:
                await on_retry()

        # Nothing is hidden: if goto fails and unroute then fails as well,
        # both errors are raised together as a group.
        nav_exc: Exception | None = None
        res: Optional[PWResponse] = None
        try:
            res = await page.goto(
                target_url, retry=retry, on_retry=_on_retry_wrapper, **goto_kwargs
            )
        except Exception as e:
            nav_exc = e
        finally:
            unroute_exc: Exception | None = None
            if installed:
                try:
                    await page.unroute(target_url, handler)
                except Exception as e:
                    unroute_exc = e
            if nav_exc is not None and unroute_exc is not None:
                raise ExceptionGroup("goto_render failed", (nav_exc, unroute_exc))
            if nav_exc is not None:
                raise nav_exc
            if unroute_exc is not None:
                raise unroute_exc

        return res

    async def fetch(
        self,
        url: str,
        *,
        method: HttpMethod = HttpMethod.GET,
        headers: Optional[dict[str, str]] = None,
        body: Optional[str | list | dict] = None,
        credentials: Literal["omit", "same-origin", "include"] = "include",
        mode: Literal["cors", "no-cors", "same-origin"] = "cors",
        redirect: Literal["follow", "error", "manual"] = "follow",
        referrer: Optional[str] = None,
        timeout_ms: int = 30000,
    ) -> FetchResponse:
        """
        Thin layer over JS `fetch`: runs the request inside the page and returns a
        `FetchResponse`.

        - No routes and no `wait_for_event` are involved; the bundled `fetch.js`
          script is evaluated in the page.
        - `raw` is always `bytes` (base64-decoded from the script's result);
          presumably already decompressed by the browser - TODO confirm against
          `fetch.js`. It is empty when no body was returned.
        - For opaque responses the body/headers may be unavailable - that is a
          CORS limitation.

        Header names are lower-cased. A `referer` header is not sent as a header
        but used as the referrer unless `referrer` is given explicitly. A
        `dict`/`list` body is JSON-encoded and sent as `application/json`.

        Raises
        ------
        RuntimeError
            If the script reports a failure (its result has a falsy `ok`).
        """
        declared_headers = {k.lower(): v for k, v in (headers or {}).items()}
        js_headers = {k: v for k, v in declared_headers.items() if k != "referer"}
        js_ref = referrer or declared_headers.get("referer")

        js_body: Any = body
        if isinstance(body, (dict, list)):
            js_body = json.dumps(body, ensure_ascii=False)
            js_headers["content-type"] = "application/json"

        start_t = time.perf_counter()

        # Read the script once and cache it on the class: a synchronous disk read
        # on every call would needlessly block the event loop.
        js_fetch = HumanPage._fetch_js_source
        if js_fetch is None:
            js_fetch = (Path(__file__).parent / "fetch.js").read_text(encoding="utf-8")
            HumanPage._fetch_js_source = js_fetch

        result = await self.evaluate(
            js_fetch,
            dict(
                url=url,
                method=method.value,
                headers=js_headers or {},
                body=js_body,
                credentials=credentials,
                mode=mode,
                redirect=redirect,
                ref=js_ref,
                timeoutMs=timeout_ms,
            ),
        )

        if not result.get("ok"):
            raise RuntimeError(f"fetch failed: {result.get('error')}")

        # Body bytes arrive base64-encoded; empty when no body was provided.
        b64 = result.get("bodyB64")
        raw = base64.b64decode(b64) if isinstance(b64, str) else b""

        # Normalise headers: when a body is present, drop the transport
        # attributes so they do not mislead the consumer.
        resp_headers = {k.lower(): v for k, v in (result.get("headers") or {}).items()}
        if raw:
            resp_headers.pop("content-encoding", None)
            resp_headers.pop("content-length", None)

        req_model = FetchRequest(
            page=self,
            method=method,
            url=URL(full_url=url),
            headers=declared_headers,
            body=body,
        )

        duration = time.perf_counter() - start_t
        end_epoch = time.time()

        resp_model = FetchResponse(
            page=self,
            request=req_model,
            url=URL(full_url=result.get("finalUrl") or url),
            headers=resp_headers,
            raw=raw,  # always bytes; empty if the body could not be read
            status_code=int(result.get("status", 0)),
            status_text=str(result.get("statusText", "STATUS TEXT NOT AVAILABLE")),
            redirected=bool(result.get("redirected", False)),
            type=result.get("type", False),
            duration=duration,
            end_time=end_epoch,
        )
        return resp_model

    @property
    def origin(self) -> str:
        """Return `scheme://netloc` of the page's current URL."""
        url_parts = urlsplit(self.url)
        return f"{url_parts.scheme}://{url_parts.netloc}"

    async def cookies(self) -> List[Cookie]:
        """BrowserContext.cookies

        Cookies for the current page URL. Alias for `page.context.cookies([page.url])`.

        Returns
        -------
        List[{
            name: str,
            value: str,
            domain: str,
            path: str,
            expires: float,
            httpOnly: bool,
            secure: bool,
            sameSite: Union["Lax", "None", "Strict"],
            partitionKey: Union[str, None]
        }]
        """
        return await self.context.cookies([self.url])

    async def local_storage(self, **kwargs: Any) -> dict[str, str]:
        """
        Return the local-storage entries for this page's origin.

        Calls `context.local_storage(**kwargs)` and picks the entry keyed by
        `self.origin`; an empty dict is returned when the origin is absent.
        """
        ls = await self.context.local_storage(**kwargs)
        return ls.get(self.origin, {})

    async def session_storage(self) -> dict[str, str]:
        """
        Return the page's `window.sessionStorage` as evaluated in the page.

        The script yields `null` (i.e. `None` here) when `sessionStorage` is not
        present on `window`, is falsy, or accessing it throws.
        """
        return await self.evaluate(
            """
            (which) => {
                try {
                    const s = (which in window) ? window[which] : null;
                    if (!s) return null;
                    return s;
                } catch (_) {
                    return null;
                }
            }
            """,
            "sessionStorage",
        )

    async def json(self) -> list | dict:
        """
        Parse the page content as JSON.

        When a page's content is JSON the browser wraps it in `body > pre`; the
        text of that node is extracted and decoded with `json.loads`.

        Raises
        ------
        RuntimeError
            If the page has no `body > pre` node.
        json.JSONDecodeError
            If the node's text is not valid JSON.
        """
        body = await self.content()
        tree = HTMLParser(body)

        node = tree.css_first("body > pre")  # exact selector "body > pre"
        if node is None:
            raise RuntimeError("Содержимое страницы не является json-контейнером")

        return json.loads(node.text())

    def __repr__(self) -> str:
        return f"<HumanPage wrapping {super().__repr__()!r}>"