Coverage for human_requests/session.py: 88%

157 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-13 21:41 +0000

1""" 

2core.session — unified stateful session for *curl_cffi* and *Playwright*-compatible engines. 

3 

4Main Methods 

5============ 

6* ``Session.request`` — low-level HTTP request (curl_cffi) with cookie jar. 

7* ``Session.goto_page`` — opens a URL in the browser, returns a Page inside 

8 a context manager; upon exit synchronizes cookies + localStorage. 

9* ``Response.render`` — offline render of a pre-fetched Response. 

10 

11Optional Dependencies 

12===================== 

13- playwright-stealth: enabled via `playwright_stealth=True`. 

14 If the package is not installed and the flag is set — raises RuntimeError 

15 with installation instructions. 

16- camoufox: selected with `browser='camoufox'`. 

17- patchright: selected with `browser='patchright'`. 

18- Incompatibility: camoufox/patchright + playwright_stealth cannot be used together. 

19 Raises RuntimeError. 

20 

21 

22Additional 

23========== 

24- Browser launch arguments are assembled via `make_browser_launch_opts()` from: 

25 - `browser_launch_opts` (arbitrary dict) 

26 - `headless` (always overrides the key of the same name) 

27 - `proxy` (string URL or dict) → adapted for Playwright/Patchright/Camoufox 

28- Proxy is also applied to curl_cffi (if no custom `proxy` is passed in .request()). 

29""" 

30 

31from __future__ import annotations 

32 

33from contextlib import asynccontextmanager 

34from time import perf_counter 

35from types import TracebackType 

36from typing import Any, AsyncGenerator, Literal, Mapping, Optional, cast 

37from urllib.parse import urlsplit 

38 

39from curl_cffi import requests as cffi_requests 

40from playwright.async_api import BrowserContext, Page 

41from playwright.async_api import Request as PWRequest 

42from playwright.async_api import Route 

43 

44from .abstraction.cookies import CookieManager 

45from .abstraction.http import URL, HttpMethod 

46from .abstraction.proxy_manager import ParsedProxy 

47from .abstraction.request import Request 

48from .abstraction.response import Response 

49from .browsers import BrowserMaster, Engine 

50from .impersonation import ImpersonationConfig 

51from .tools.helper_tools import ( 

52 build_storage_state_for_context, 

53 handle_nav_with_retries, 

54 merge_storage_state_from_context, 

55) 

56from .tools.http_utils import ( 

57 collect_set_cookie_headers, 

58 compose_cookie_header, 

59 guess_encoding, 

60 parse_set_cookie, 

61) 

62 

63__all__ = ["Session"] 

64 

65 

class Session:
    """Stateful session built on curl_cffi.AsyncSession + BrowserMaster + CookieManager.

    Direct HTTP requests (``request``) and browser navigations (``goto_page``,
    offline render of a ``Response``) share one cookie store and one
    localStorage snapshot, which are synchronized after every operation.

    The session can be used as an async context manager; ``close()`` is
    called on exit.
    """

    def __init__(
        self,
        *,
        timeout: float = 15.0,
        headless: bool = True,
        browser: Engine = "chromium",
        spoof: ImpersonationConfig | None = None,
        playwright_stealth: bool = True,
        page_retry: int = 2,
        direct_retry: int = 1,
        browser_launch_opts: Mapping[str, Any] | None = None,
        proxy: str | dict[str, str] | None = None,
    ) -> None:
        """
        Args:
            timeout: default timeout (seconds) for both direct and goto requests
            headless: launch mode (passed into browser launch arguments)
            browser: chromium/firefox/webkit — standard; camoufox/patchright — special builds
            spoof: configuration for direct requests
            playwright_stealth: hides certain automation browser signatures
            page_retry: number of "soft" retries for page navigation (after the initial attempt)
            direct_retry: retries for direct requests on curl_cffi Timeout (after first attempt)
            browser_launch_opts: arbitrary extra browser launch arguments;
                ``None`` means "no extra arguments"
            proxy: proxy as a URL string or a playwright-like dict; applied to the
                browser and, unless overridden per call, to direct requests

        Raises:
            RuntimeError: if ``playwright_stealth`` is enabled together with
                ``browser='camoufox'`` or ``browser='patchright'``.
        """
        self.timeout: float = timeout
        """Timeout for goto/direct requests."""

        self.headless: bool = bool(headless)
        """Whether to run the browser in headless mode."""

        self.browser_name: Engine = browser
        """Current browser (chromium/firefox/webkit/camoufox/patchright)."""

        self.spoof: ImpersonationConfig = spoof or ImpersonationConfig()
        """Impersonation settings (user-agent, TLS, client-hello)."""

        self.playwright_stealth: bool = bool(playwright_stealth)
        """Hide certain automation signatures?
        Implemented via JS injection. Some sites may detect this."""

        self.page_retry: int = int(page_retry)
        """Number of soft retries for page navigation (after the initial attempt)."""

        self.direct_retry: int = int(direct_retry)
        """Number of retries for a direct request on Timeout (after the first attempt)."""

        if self.browser_name in ("camoufox", "patchright") and self.playwright_stealth:
            raise RuntimeError(
                "playwright_stealth=True is incompatible with browser='camoufox'/'patchright'. "
                "Disable stealth or use chromium/firefox/webkit."
            )

        # Custom browser launch parameters + proxy.
        # A fresh dict per instance replaces the former shared ``{}`` default;
        # a caller-supplied mapping is still stored by reference.
        self.browser_launch_opts: Mapping[str, Any] = (
            {} if browser_launch_opts is None else browser_launch_opts
        )
        """Browser launch arguments (arbitrary keys)."""

        self.proxy: str | dict[str, str] | None = proxy
        """
        Proxy server, one of:

        a. URL string in the form: `schema://user:pass@host:port`

        b. playwright-like dict
        """

        # Cookie/localStorage state
        self.cookies: CookieManager = CookieManager([])
        """Storage of all active cookies."""

        self.local_storage: dict[str, dict[str, str]] = {}
        """localStorage from the last browser context (goto run)."""

        # Low-level HTTP client, created lazily on the first request()
        self._curl: Optional[cffi_requests.AsyncSession] = None

        # Browser engine — via the master (always hands out a Browser)
        self._bm: BrowserMaster = BrowserMaster(
            engine=self.browser_name,
            stealth=self.playwright_stealth,
            launch_opts=self._make_browser_launch_opts(),  # initial snapshot
        )

    # ──────────────── Launch args & proxy helpers ────────────────
    def _make_browser_launch_opts(self) -> dict[str, Any]:
        """
        Merges launch arguments for BrowserMaster from Session settings.

        Sources:
            - self.browser_launch_opts (arbitrary keys)
            - self.headless (overrides the key of the same name)
            - self.proxy (URL string or dict) → converted to Playwright-style proxy

        Returns:
            A new dict; the stored ``browser_launch_opts`` mapping is not mutated.
        """
        opts = dict(self.browser_launch_opts)
        opts["headless"] = bool(self.headless)

        pw_proxy = ParsedProxy.from_any(self.proxy)
        if pw_proxy is not None:
            opts["proxy"] = pw_proxy.for_playwright()

        return opts

    # ────── HTTP via curl_cffi ──────
    async def request(
        self,
        method: HttpMethod | str,
        url: str,
        *,
        headers: Optional[Mapping[str, str]] = None,
        retry: int | None = None,
        **kwargs: Any,
    ) -> Response:
        """
        Standard fast request via curl_cffi.
        You must provide either an HttpMethod or its string representation, as well as a URL.

        Optionally, you can pass additional headers.

        Extra parameters can be passed through **kwargs to curl_cffi.AsyncSession.request
        (see their documentation for details). Two keys are handled here:

        - ``proxy``: per-call proxy; falls back to ``Session.proxy`` when absent.
        - ``timeout``: per-call timeout; falls back to ``Session.timeout`` when absent.

        Retries are performed ONLY on cffi Timeout: ``curl_cffi.requests.exceptions.Timeout``.

        Args:
            method: HttpMethod member or its name (case-insensitive).
            url: target URL.
            headers: extra request headers; keys are lower-cased.
            retry: number of retries after the first attempt; ``None`` uses
                ``self.direct_retry``.

        Returns:
            Response model holding the decoded body, headers, parsed cookies and timing.

        Raises:
            KeyError: if ``method`` is a string that names no HttpMethod member.
            curl_cffi.requests.exceptions.Timeout: if every attempt timed out.
        """
        method_enum = method if isinstance(method, HttpMethod) else HttpMethod[str(method).upper()]
        base_headers = {k.lower(): v for k, v in (headers or {}).items()}

        # Lazy curl session
        curl = self._curl
        if curl is None:
            curl = self._curl = cffi_requests.AsyncSession()

        # Spoof UA / headers
        imper_profile = self.spoof.choose(self.browser_name)
        base_headers.update(self.spoof.forge_headers(imper_profile))

        # Cookie header (computed once, reused for every attempt)
        url_parts = urlsplit(url)
        cookie_header, sent_cookies = compose_cookie_header(
            url_parts, base_headers, list(self.cookies)
        )
        if cookie_header:
            base_headers["cookie"] = cookie_header

        # Proxy: the per-call value wins, otherwise fall back to Session.proxy
        pp_user_proxies = ParsedProxy.from_any(kwargs.pop("proxy", None))
        user_proxies = pp_user_proxies.for_curl() if pp_user_proxies else None

        pp_default_proxies = ParsedProxy.from_any(self.proxy)
        default_proxies = pp_default_proxies.for_curl() if pp_default_proxies else None

        proxy_arg = user_proxies if user_proxies is not None else default_proxies

        # Pull ``timeout`` out of kwargs so a caller-supplied value overrides the
        # session default instead of colliding with the explicit keyword below.
        timeout = kwargs.pop("timeout", self.timeout)

        retries = self.direct_retry if retry is None else int(retry)
        total_attempts = max(retries, 0) + 1

        # First attempt + soft retries on Timeout only
        for attempt in range(total_attempts):
            t0 = perf_counter()
            try:
                r = await curl.request(
                    method_enum.value,
                    url,
                    headers=dict(base_headers),  # fresh copy per attempt
                    impersonate=cast(  # narrow the type to curl_cffi's Literal set
                        "cffi_requests.impersonate.BrowserTypeLiteral", imper_profile
                    ),
                    timeout=timeout,
                    proxy=proxy_arg,
                    **kwargs,
                )
            except cffi_requests.exceptions.Timeout:
                if attempt == total_attempts - 1:
                    raise  # out of attempts: surface the last Timeout
                continue
            duration = perf_counter() - t0
            break

        # response → cookies
        resp_headers = {k.lower(): v for k, v in r.headers.items()}
        raw_sc = collect_set_cookie_headers(r.headers)
        resp_cookies = parse_set_cookie(raw_sc, url_parts.hostname or "")
        self.cookies.add(resp_cookies)

        charset = guess_encoding(resp_headers)
        try:
            body_text = r.content.decode(charset, errors="replace")
        except LookupError:
            # The charset label has no Python codec; fall back to UTF-8
            # rather than failing the whole request.
            body_text = r.content.decode("utf-8", errors="replace")

        # Record the first body argument that was actually supplied; a plain
        # ``or`` chain would drop present-but-falsy bodies such as ``json={}``.
        sent_body = next(
            (
                value
                for value in (kwargs.get("data"), kwargs.get("json"), kwargs.get("files"))
                if value is not None
            ),
            None,
        )

        # models
        req_model = Request(
            method=method_enum,
            url=URL(full_url=url),
            headers=dict(base_headers),
            body=sent_body,
            cookies=sent_cookies,
        )
        resp_model = Response(
            request=req_model,
            url=URL(full_url=str(r.url)),
            headers=resp_headers,
            cookies=resp_cookies,
            body=body_text,
            status_code=r.status_code,
            duration=duration,
            _render_callable=self._render_response,
        )
        return resp_model

    # ────── browser helpers ──────
    async def _finalize_context(self, ctx: BrowserContext, page: Page | None) -> None:
        """Sync cookies/localStorage back into the session, then close page and context.

        The storage merge is skipped when no page was ever created. The page
        and the context are closed even if the merge (or the page close) raises.
        """
        try:
            if page is not None:
                self.local_storage = await merge_storage_state_from_context(
                    ctx, cookie_manager=self.cookies
                )
        finally:
            try:
                if page is not None:
                    await page.close()
            finally:
                await ctx.close()

    # ────── browser nav ──────
    @asynccontextmanager
    async def goto_page(
        self,
        url: str,
        *,
        wait_until: Literal["commit", "load", "domcontentloaded", "networkidle"] = "commit",
        retry: int | None = None,
    ) -> AsyncGenerator[Page, None]:
        """
        Opens a page in the browser using a one-time context.
        Retries perform a "soft reload" without recreating the context.

        Args:
            url: address to navigate to.
            wait_until: navigation event to wait for.
            retry: number of retries; ``None`` uses ``self.page_retry``.

        Yields:
            The opened Page. On exit, cookies and localStorage are merged back
            into the session and the page and context are closed.
        """
        # Refresh launch arguments in the master before starting
        self._bm.launch_opts = self._make_browser_launch_opts()
        await self._bm.start()

        storage_state = build_storage_state_for_context(
            local_storage=self.local_storage,
            cookie_manager=self.cookies,
        )
        ctx = await self._bm.new_context(storage_state=storage_state)
        timeout_ms = int(self.timeout * 1000)
        attempts_left = self.page_retry if retry is None else int(retry)

        # Page creation happens inside the try so the context is closed even
        # if ``new_page()`` itself fails.
        page: Page | None = None
        try:
            page = await ctx.new_page()
            await handle_nav_with_retries(
                page,
                target_url=url,
                wait_until=wait_until,
                timeout_ms=timeout_ms,
                attempts=attempts_left,
                on_retry=None,
            )
            yield page
        finally:
            await self._finalize_context(ctx, page)

    # ────── Offline render ──────
    @asynccontextmanager
    async def _render_response(
        self,
        response: Response,
        *,
        wait_until: Literal["load", "domcontentloaded", "networkidle"] = "domcontentloaded",
        retry: int | None = None,
    ) -> AsyncGenerator[Page, None]:
        """
        Offline render of a Response: creates a temporary context (with our storage_state),
        intercepts the first request and responds with the prepared body.
        Retries do not recreate the context/page — instead a "soft reload" is performed,
        reattaching the route on retry.

        Args:
            response: previously fetched Response whose body is served to the browser.
            wait_until: navigation event to wait for.
            retry: number of retries; ``None`` uses ``self.page_retry``.

        Yields:
            The Page showing the rendered response. On exit, cookies and
            localStorage are merged back and the page and context are closed.
        """
        # Refresh launch arguments in the master before starting
        self._bm.launch_opts = self._make_browser_launch_opts()
        await self._bm.start()

        storage_state = build_storage_state_for_context(
            local_storage=self.local_storage,
            cookie_manager=self.cookies,
        )
        ctx: BrowserContext = await self._bm.new_context(storage_state=cast(Any, storage_state))
        timeout_ms = int(self.timeout * 1000)
        attempts_left = self.page_retry if retry is None else int(retry)

        async def _attach_route_once() -> None:
            # Drop any previous handler, then serve exactly one request.
            await ctx.unroute("**/*")

            async def handler(route: Route, _req: PWRequest) -> None:
                # NOTE(review): the body is always re-encoded as UTF-8 while the
                # original headers are passed through unchanged — confirm this is
                # fine for responses that declare another charset or a
                # content-encoding.
                await route.fulfill(
                    status=response.status_code,
                    headers=dict(response.headers),
                    body=response.body.encode("utf-8"),
                )

            await ctx.route("**/*", handler, times=1)

        async def _on_retry() -> None:
            await _attach_route_once()

        # Route attachment and page creation happen inside the try so the
        # context is closed even if either of them fails.
        page: Page | None = None
        try:
            await _attach_route_once()
            page = await ctx.new_page()
            await handle_nav_with_retries(
                page,
                target_url=response.url.full_url,
                wait_until=wait_until,
                timeout_ms=timeout_ms,
                attempts=attempts_left,
                on_retry=_on_retry,
            )
            yield page
        finally:
            await self._finalize_context(ctx, page)

    # ────── cleanup ──────
    async def close(self) -> None:
        """Close the browser engines and the HTTP session.

        The HTTP session is closed (and the reference dropped) even if
        closing the browser raises.
        """
        try:
            # Close the browser engines
            await self._bm.close()
        finally:
            # Close the HTTP session
            curl, self._curl = self._curl, None
            if curl is not None:
                await curl.close()

    # "async with" support
    async def __aenter__(self) -> "Session":
        """Return the session itself."""
        return self

    async def __aexit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc: Optional[BaseException],
        tb: Optional[TracebackType],
    ) -> None:
        """Close the session; exceptions from the ``async with`` body propagate."""
        await self.close()

416 await self.close()