import sys
import contextlib
import inspect
import io
import json
import os
import re
import ssl
import time

# We need files() from Python 3.10 or higher
if sys.version_info >= (3, 10):
    import importlib.resources as ilr
else:
    import importlib_resources as ilr

from urllib.error import URLError
from urllib.parse import quote_plus
from urllib import request

from fake_useragent.log import logger

# Fallback method for retrieving the data file
try:
    from pkg_resources import resource_filename
except ImportError:
    pass

str_types = (str,)
text = str

urlopen_args = inspect.getfullargspec(request.urlopen).kwonlyargs
urlopen_has_ssl_context = "context" in urlopen_args


def get(url, verify_ssl=True):
    attempt = 0
    while True:
        request_obj = request.Request(url)
        attempt += 1
        try:
            if urlopen_has_ssl_context:
                if not verify_ssl:
                    context = ssl._create_unverified_context()
                else:
                    context = None
                with contextlib.closing(
                    request.urlopen(
                        request_obj,
                        timeout=settings.HTTP_TIMEOUT,
                        context=context,
                    )
                ) as response:
                    return response.read()
            else:  # ssl context is not supported ;(
                with contextlib.closing(
                    request.urlopen(
                        request_obj,
                        timeout=settings.HTTP_TIMEOUT,
                    )
                ) as response:
                    return response.read()
        except (URLError, OSError) as exc:
            logger.debug(
                "Error occurred during fetching %s",
                url,
                exc_info=exc,
            )

            if attempt == settings.HTTP_RETRIES:
                raise FakeUserAgentError("Maximum amount of retries reached")
            else:
                logger.debug(
                    "Sleeping for %s seconds",
                    settings.HTTP_DELAY,
                )
                time.sleep(settings.HTTP_DELAY)
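
# Example use of get() (an illustrative sketch kept in a comment so nothing
# runs at import time; the URL below is made up, and settings.HTTP_RETRIES /
# settings.HTTP_DELAY govern the retry loop above):
#
#     raw = get("https://example.com/agents.html", verify_ssl=False)
#     print(raw[:80])
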
")[1] html = html.split("
")[0] pattern = r"(.+?)" browsers_iter = re.finditer(pattern, html, re.UNICODE) browsers = [] for browser in browsers_iter: if "more" in browser.group(1).lower(): continue browsers.append(browser.group(1)) if len(browsers) == settings.BROWSERS_COUNT_LIMIT: break if not browsers: raise FakeUserAgentError( "No browser user-agent strings found for browser: {browser}".format( browser=browser ) ) return browsers def load(browsers, use_local_file=True, verify_ssl=True): data = {} fetch_online = True if use_local_file: try: json_lines = ( ilr.files("fake_useragent.data").joinpath("browsers.json").read_text() ) for line in json_lines.splitlines(): data.update(json.loads(line)) fetch_online = False ret = data except Exception as exc: # Empty data just to be sure data = {} logger.warning( "Unable to find local data/json file or could not parse the contents using importlib-resources. Try pkg-resource next.", exc_info=exc, ) try: with open( resource_filename("fake_useragent", "data/browsers.json") ) as file: json_lines = file.read() for line in json_lines.splitlines(): data.update(json.loads(line)) fetch_online = False ret = data except Exception as exc2: # Empty data just to be sure data = {} logger.warning( "Could not find local data/json file or could not parse the contents using pkg-resource. Fallback to external resource.", exc_info=exc2, ) # Fallback behaviour or use_external_data parameter is explicitly set to True if fetch_online: try: # For each browser receive the user-agent strings for browser_name in browsers: browser_name = browser_name.lower().strip() data[browser_name] = get_browser_user_agents_online( browser_name, verify_ssl=verify_ssl, ) except Exception: raise FakeUserAgentError("Could not load data from external website") else: ret = data if not ret: raise FakeUserAgentError("Data dictionary is empty", ret) if not isinstance(ret, dict): raise FakeUserAgentError("Data is not dictionary ", ret) return ret def write(path, data): with open(path, encoding="utf-8", mode="w") as fp: dumped = json.dumps(data) if not isinstance(dumped, text): # Python 2 dumped = dumped.decode("utf-8") fp.write(dumped) def read(path): with open(path, encoding="utf-8") as fp: return json.loads(fp.read()) def exist(path): return os.path.isfile(path) def rm(path): if exist(path): os.remove(path) def update(cache_path, browsers, verify_ssl=True): rm(cache_path) write(cache_path, load(browsers, use_local_file=False, verify_ssl=verify_ssl)) def load_cached(cache_path, browsers, verify_ssl=True): if not exist(cache_path): update(cache_path, browsers, verify_ssl=verify_ssl) return read(cache_path) from fake_useragent import settings # noqa # isort:skip from fake_useragent.errors import FakeUserAgentError # noqa # isort:skip