import asyncio
import re
from enum import Enum
from functools import partialmethod
from io import BytesIO
from json import dumps
from typing import Callable, Dict, List, Optional, Tuple, Union, cast
from urllib.parse import ParseResult, parse_qsl, unquote, urlencode, urlparse

from .. import AsyncCurl, Curl, CurlError, CurlInfo, CurlOpt
from .cookies import Cookies, CookieTypes, Request, Response
from .errors import RequestsError
from .headers import Headers, HeaderTypes


class BrowserType(str, Enum):
    edge99 = "edge99"
    edge101 = "edge101"
    chrome99 = "chrome99"
    chrome100 = "chrome100"
    chrome101 = "chrome101"
    chrome104 = "chrome104"
    chrome107 = "chrome107"
    chrome110 = "chrome110"
    chrome99_android = "chrome99_android"
    safari15_3 = "safari15_3"
    safari15_5 = "safari15_5"

    @classmethod
    def has(cls, item):
        return item in cls.__members__
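
    # For example, BrowserType.has("chrome110") is True, while
    # BrowserType.has("firefox") is False: only the targets listed
    # above can be impersonated.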


def _update_url_params(url: str, params: Dict) -> str:
    """Add GET params to the provided URL, preserving existing ones.

    :param url: string of target URL
    :param params: dict containing the params to be added
    :return: string with updated URL

    >>> url = 'http://stackoverflow.com/test?answers=true'
    >>> new_params = {'answers': False, 'data': ['some', 'values']}
    >>> _update_url_params(url, new_params)
    'http://stackoverflow.com/test?answers=false&data=some&data=values'
    """
    # Unquoting URL first so we don't lose existing args
    url = unquote(url)
    # Extracting URL info
    parsed_url = urlparse(url)
    # Extracting URL arguments from parsed URL
    get_args = parsed_url.query
    # Converting URL arguments to dict
    parsed_get_args = dict(parse_qsl(get_args))
    # Merging URL arguments dict with new params
    parsed_get_args.update(params)

    # Bool and Dict values should be converted to json-friendly values
    # you may throw this part away if you don't like it :)
    parsed_get_args.update(
        {k: dumps(v) for k, v in parsed_get_args.items() if isinstance(v, (bool, dict))}
    )

    # Converting URL arguments to a proper query string
    encoded_get_args = urlencode(parsed_get_args, doseq=True)
    # Creating a new ParseResult object based on the provided one, with the
    # new URL arguments. The same thing happens inside urlparse.
    new_url = ParseResult(
        parsed_url.scheme,
        parsed_url.netloc,
        parsed_url.path,
        parsed_url.params,
        encoded_get_args,
        parsed_url.fragment,
    ).geturl()

    return new_url


def _update_header_line(header_lines: List[str], key: str, value: str):
    for idx, line in enumerate(header_lines):
        # compare case-insensitively on both sides, so that keys like
        # "Content-Type" also match existing lowercase header lines
        if line.lower().startswith(key.lower() + ":"):
            header_lines[idx] = f"{key}: {value}"
            break
    else:  # if not break
        header_lines.append(f"{key}: {value}")
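
# A quick sketch of _update_header_line's behavior (hypothetical values):
#
#     lines = ["accept: */*"]
#     _update_header_line(lines, "Content-Type", "application/json")
#     # -> ["accept: */*", "Content-Type: application/json"]
#     _update_header_line(lines, "content-type", "text/html")
#     # -> ["accept: */*", "content-type: text/html"]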


class BaseSession:
    __attrs__ = [
        "headers",
        "cookies",
        "auth",
        "proxies",
        "params",
        "verify",
        "cert",
        "stream",  # TODO
        "trust_env",  # TODO
        "max_redirects",
        "impersonate",
        "timeout",
    ]

    def __init__(
        self,
        *,
        headers: Optional[HeaderTypes] = None,
        cookies: Optional[CookieTypes] = None,
        auth: Optional[Tuple[str, str]] = None,
        proxies: Optional[dict] = None,
        params: Optional[dict] = None,
        verify: bool = True,
        timeout: Union[float, Tuple[float, float]] = 30,
        trust_env: bool = True,
        max_redirects: int = -1,
        impersonate: Optional[Union[str, BrowserType]] = None,
    ):
        self.headers = Headers(headers)
        self.cookies = Cookies(cookies)
        self.auth = auth
        self.proxies = proxies
        self.params = params
        self.verify = verify
        self.timeout = timeout
        self.trust_env = trust_env
        self.max_redirects = max_redirects
        self.impersonate = impersonate
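
    # Note: the per-request arguments accepted by _set_curl_options below are
    # merged with, and take precedence over, the session-level defaults set in
    # __init__ above.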

    def _set_curl_options(
        self,
        curl,
        method: str,
        url: str,
        params: Optional[dict] = None,
        data: Optional[Union[Dict[str, str], str, BytesIO, bytes]] = None,
        json: Optional[dict] = None,
        headers: Optional[HeaderTypes] = None,
        cookies: Optional[CookieTypes] = None,
        files: Optional[Dict] = None,
        auth: Optional[Tuple[str, str]] = None,
        timeout: Optional[Union[float, Tuple[float, float]]] = None,
        allow_redirects: bool = True,
        max_redirects: Optional[int] = None,
        proxies: Optional[dict] = None,
        verify: Optional[bool] = None,
        referer: Optional[str] = None,
        accept_encoding: Optional[str] = "gzip, deflate, br",
        content_callback: Optional[Callable] = None,
        impersonate: Optional[Union[str, BrowserType]] = None,
    ):
        c = curl

        # method
        c.setopt(CurlOpt.CUSTOMREQUEST, method.encode())

        # url
        if self.params:
            url = _update_url_params(url, self.params)
        if params:
            url = _update_url_params(url, params)
        c.setopt(CurlOpt.URL, url.encode())

        # data/body/json
        if isinstance(data, dict):
            body = urlencode(data).encode()
        elif isinstance(data, str):
            body = data.encode()
        elif isinstance(data, BytesIO):
            body = data.read()
        elif isinstance(data, bytes):
            body = data
        elif data is None:
            body = b""
        else:
            raise TypeError("data must be dict, str, BytesIO or bytes")
        if json:
            body = dumps(json).encode()
        if body:
            c.setopt(CurlOpt.POSTFIELDS, body)
            # necessary if body contains '\0'
            c.setopt(CurlOpt.POSTFIELDSIZE, len(body))

        # headers
        h = Headers(self.headers)
        h.update(headers)

        # cookies
        co = Cookies(self.cookies)
        co.update(cookies)
        req = Request(url=url, headers=h, method=method)
        co.set_cookie_header(req)

        # An alternative way to implement the cookiejar is to use curl's builtin
        # cookiejar. However, it would be difficult to interoperate with Headers
        # and to get cookies as dicts.
        # c.setopt(CurlOpt.COOKIE, cookies_str.encode())

        header_lines = []
        for k, v in h.multi_items():
            header_lines.append(f"{k}: {v}")
        if json:
            _update_header_line(header_lines, "Content-Type", "application/json")
        if isinstance(data, dict):
            _update_header_line(
                header_lines, "Content-Type", "application/x-www-form-urlencoded"
            )
        # print("header lines", header_lines)
        c.setopt(CurlOpt.HTTPHEADER, [h.encode() for h in header_lines])

        # files
        if files:
            raise NotImplementedError("Files have not been implemented.")

        # auth
        if self.auth or auth:
            if self.auth:
                username, password = self.auth
            if auth:
                username, password = auth
            c.setopt(CurlOpt.USERNAME, username.encode())  # type: ignore
            c.setopt(CurlOpt.PASSWORD, password.encode())  # type: ignore

        # timeout
        timeout = timeout or self.timeout
        if isinstance(timeout, tuple):
            connect_timeout, read_timeout = timeout
            all_timeout = connect_timeout + read_timeout
            c.setopt(CurlOpt.CONNECTTIMEOUT_MS, int(connect_timeout * 1000))
            c.setopt(CurlOpt.TIMEOUT_MS, int(all_timeout * 1000))
        else:
            c.setopt(CurlOpt.TIMEOUT_MS, int(timeout * 1000))
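
        # For example, timeout=(5, 25) sets CONNECTTIMEOUT_MS to 5000 and
        # TIMEOUT_MS to 30000 (connect + read combined), while timeout=10
        # sets only TIMEOUT_MS, to 10000.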

        # allow_redirects
        c.setopt(CurlOpt.FOLLOWLOCATION, int(allow_redirects))

        # max_redirects
        c.setopt(CurlOpt.MAXREDIRS, max_redirects or self.max_redirects)

        # proxies
        if self.proxies:
            proxies = {**self.proxies, **(proxies or {})}
        if proxies:
            if url.startswith("http://"):
                if proxies.get("http") is not None:
                    c.setopt(CurlOpt.PROXY, proxies["http"])
            elif url.startswith("https://"):
                if proxies.get("https") is not None:
                    if proxies["https"].startswith("https://"):
                        raise RequestsError(
                            "You are using an http proxy with the wrong prefix: "
                            "it should be 'http://', not 'https://', "
                            "see: https://github.com/yifeikong/curl_cffi/issues/6"
                        )
                    c.setopt(CurlOpt.PROXY, proxies["https"])
                    # for an http proxy, curl must be told to enable tunneling
                    if not proxies["https"].startswith("socks"):
                        c.setopt(CurlOpt.HTTPPROXYTUNNEL, 1)

        # verify
        if verify is False or (not self.verify and verify is None):
            c.setopt(CurlOpt.SSL_VERIFYPEER, 0)
            c.setopt(CurlOpt.SSL_VERIFYHOST, 0)

        # referer
        if referer:
            c.setopt(CurlOpt.REFERER, referer.encode())

        # accept_encoding
        if accept_encoding is not None:
            c.setopt(CurlOpt.ACCEPT_ENCODING, accept_encoding.encode())

        # impersonate
        impersonate = impersonate or self.impersonate
        if impersonate:
            if not BrowserType.has(impersonate):
                raise RequestsError(f"impersonate {impersonate} is not supported")
            c.impersonate(impersonate)

        if content_callback is None:
            buffer = BytesIO()
            c.setopt(CurlOpt.WRITEDATA, buffer)
        else:
            buffer = None
            c.setopt(CurlOpt.WRITEFUNCTION, content_callback)
        header_buffer = BytesIO()
        c.setopt(CurlOpt.HEADERDATA, header_buffer)

        return req, buffer, header_buffer

    def _parse_response(self, curl, req: Request, buffer, header_buffer):
        c = curl
        rsp = Response(c, req)
        rsp.url = cast(bytes, c.getinfo(CurlInfo.EFFECTIVE_URL)).decode()
        if buffer:
            rsp.content = buffer.getvalue()  # type: ignore
        rsp.status_code = cast(int, c.getinfo(CurlInfo.RESPONSE_CODE))
        rsp.ok = 200 <= rsp.status_code < 400
        header_lines = header_buffer.getvalue().splitlines()

        # TODO history urls
        header_list = []
        for i, header_line in enumerate(header_lines):
            if not header_line.strip():
                continue
            if header_line.startswith(b"HTTP/"):
                # read header from the last response
                rsp.reason = c.get_reason_phrase(header_line).decode()
                # empty the header list for a new redirected response, keeping only
                # the set-cookie headers already seen earlier in the redirect chain
                header_list = [
                    h for h in header_lines[:i] if h.lower().startswith(b"set-cookie")
                ]
                continue
            header_list.append(header_line)
        rsp.headers = Headers(header_list)
        rsp.cookies = self.cookies
        self.cookies.extract_cookies(rsp)
        # print("Cookies after extraction", self.cookies)

        content_type = rsp.headers.get("Content-Type", default="")
        m = re.search(r"charset=([\w-]+)", content_type)
        charset = m.group(1) if m else "utf-8"

        rsp.charset = charset
        rsp.encoding = charset  # TODO use chardet

        rsp.elapsed = cast(float, c.getinfo(CurlInfo.TOTAL_TIME))
        rsp.redirect_count = cast(int, c.getinfo(CurlInfo.REDIRECT_COUNT))
        rsp.redirect_url = cast(bytes, c.getinfo(CurlInfo.REDIRECT_URL)).decode()

        return rsp


class Session(BaseSession):
    def __init__(self, curl: Optional[Curl] = None, **kwargs):
        super().__init__(**kwargs)
        self.curl = curl if curl is not None else Curl()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def close(self):
        self.curl.close()

    def request(
        self,
        method: str,
        url: str,
        params: Optional[dict] = None,
        data: Optional[Union[Dict[str, str], str, BytesIO, bytes]] = None,
        json: Optional[dict] = None,
        headers: Optional[HeaderTypes] = None,
        cookies: Optional[CookieTypes] = None,
        files: Optional[Dict] = None,
        auth: Optional[Tuple[str, str]] = None,
        timeout: Optional[Union[float, Tuple[float, float]]] = None,
        allow_redirects: bool = True,
        max_redirects: Optional[int] = None,
        proxies: Optional[dict] = None,
        verify: Optional[bool] = None,
        referer: Optional[str] = None,
        accept_encoding: Optional[str] = "gzip, deflate, br",
        content_callback: Optional[Callable] = None,
        impersonate: Optional[Union[str, BrowserType]] = None,
    ) -> Response:
        c = self.curl
        req, buffer, header_buffer = self._set_curl_options(
            c,
            method,
            url,
            params,
            data,
            json,
            headers,
            cookies,
            files,
            auth,
            timeout,
            allow_redirects,
            max_redirects,
            proxies,
            verify,
            referer,
            accept_encoding,
            content_callback,
            impersonate,
        )
        try:
            c.perform()
        except CurlError as e:
            raise RequestsError(e)

        rsp = self._parse_response(c, req, buffer, header_buffer)
        self.curl.reset()
        return rsp

    head = partialmethod(request, "HEAD")
    get = partialmethod(request, "GET")
    post = partialmethod(request, "POST")
    put = partialmethod(request, "PUT")
    patch = partialmethod(request, "PATCH")
    delete = partialmethod(request, "DELETE")
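

# A minimal usage sketch (the URL is illustrative only):
#
#     with Session(impersonate=BrowserType.chrome110) as s:
#         r = s.get("https://example.com", params={"q": "curl"})
#         print(r.status_code, r.headers.get("Content-Type"))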


class AsyncSession(BaseSession):
    def __init__(
        self,
        *,
        loop=None,
        async_curl: Optional[AsyncCurl] = None,
        max_clients: int = 10,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.loop = loop if loop is not None else asyncio.get_running_loop()
        self.acurl = async_curl if async_curl is not None else AsyncCurl(loop=self.loop)
        self.max_clients = max_clients
        self.reset()

    def reset(self):
        self.pool = asyncio.LifoQueue(self.max_clients)
        while True:
            try:
                self.pool.put_nowait(None)
            except asyncio.QueueFull:
                break
        self._running_curl = []

    async def pop_curl(self):
        curl = await self.pool.get()
        if curl is None:
            curl = Curl()
        self._running_curl.append(curl)
        return curl

    def push_curl(self, curl):
        try:
            self.pool.put_nowait(curl)
        except asyncio.QueueFull:
            pass

    async def __aenter__(self):
        return self

    async def __aexit__(self, *args):
        self.close()
        return None

    def close(self):
        self.acurl.close()

    async def request(
        self,
        method: str,
        url: str,
        params: Optional[dict] = None,
        data: Optional[Union[Dict[str, str], str, BytesIO, bytes]] = None,
        json: Optional[dict] = None,
        headers: Optional[HeaderTypes] = None,
        cookies: Optional[CookieTypes] = None,
        files: Optional[Dict] = None,
        auth: Optional[Tuple[str, str]] = None,
        timeout: Optional[Union[float, Tuple[float, float]]] = None,
        allow_redirects: bool = True,
        max_redirects: Optional[int] = None,
        proxies: Optional[dict] = None,
        verify: Optional[bool] = None,
        referer: Optional[str] = None,
        accept_encoding: Optional[str] = "gzip, deflate, br",
        content_callback: Optional[Callable] = None,
        impersonate: Optional[Union[str, BrowserType]] = None,
    ):
        curl = await self.pop_curl()
        req, buffer, header_buffer = self._set_curl_options(
            curl,
            method,
            url,
            params,
            data,
            json,
            headers,
            cookies,
            files,
            auth,
            timeout,
            allow_redirects,
            max_redirects,
            proxies,
            verify,
            referer,
            accept_encoding,
            content_callback,
            impersonate,
        )
        try:
            # curl.debug()
            await self.acurl.add_handle(curl)
            # print(curl.getinfo(CurlInfo.CAINFO))
            curl.clean_after_perform()
        except CurlError as e:
            raise RequestsError(e)
        rsp = self._parse_response(curl, req, buffer, header_buffer)
        curl.reset()
        self.push_curl(curl)
        return rsp

    head = partialmethod(request, "HEAD")
    get = partialmethod(request, "GET")
    post = partialmethod(request, "POST")
    put = partialmethod(request, "PUT")
    patch = partialmethod(request, "PATCH")
    delete = partialmethod(request, "DELETE")
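

# A minimal usage sketch (must run inside an event loop, since __init__
# calls asyncio.get_running_loop(); the URL is illustrative only):
#
#     async def main():
#         async with AsyncSession() as s:
#             r = await s.get("https://example.com")
#             print(r.status_code)
#
#     asyncio.run(main())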