From ae6dab36b713596e073a4ed4defc3e671ebbe55d Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Thu, 18 Jun 2026 12:41:00 -0300 Subject: [PATCH 1/2] dotenv support --- CHANGES.rst | 10 ++++ docs/quickstart.rst | 21 +++++++ scrapinghub/client/__init__.py | 14 ++++- scrapinghub/client/utils.py | 56 +++++++++++++++--- setup.py | 3 +- tests/client/test_utils.py | 104 ++++++++++++++++++++++++++++++++- 6 files changed, 196 insertions(+), 12 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 6c1b026f..a0fbfad8 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,16 @@ Release notes ============= +2.8.0 (unreleased) +------------------- + +- accept ``SHUB_APIKEY`` as an alias for the ``SH_APIKEY`` environment variable +- load the ``SH_APIKEY`` (or ``SHUB_APIKEY``) and ``SHUB_JOBAUTH`` environment + variables from a ``.env`` file via `python-dotenv`; use the new + ``dotenv_path`` argument of + :class:`~scrapinghub.client.ScrapinghubClient` to read a file other than the + default ``.env`` + 2.7.0 (2026-04-07) ------------------- diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 278161e9..b6f44494 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -32,6 +32,27 @@ Instantiate a new client with your Scrapy Cloud API key:: .. note:: Your Scrapy Cloud API key is available at the bottom of https://app.zyte.com/o/settings after you sign up. +If you instantiate the client without an explicit API key, it reads the +``SH_APIKEY`` (or its ``SHUB_APIKEY`` alias, or ``SHUB_JOBAUTH``) environment +variable instead:: + + >>> client = ScrapinghubClient() # reads SH_APIKEY from the environment + +Instead of exporting the variable yourself, you can store it in a ``.env`` +file and let the client load it:: + + SH_APIKEY=84c87545607a4bc0**************** + +By default the client reads the nearest ``.env`` file, looking in the current +directory and then walking up through its parent directories. 
Use the +``dotenv_path`` argument to point it at a different file:: + + >>> client = ScrapinghubClient(dotenv_path='/path/to/myenv') + +Only the ``SH_APIKEY``, ``SHUB_APIKEY`` and ``SHUB_JOBAUTH`` variables are read +from the file; any other variables it contains are ignored. The file is only +consulted when none of these variables is set in the environment. + List your deployed projects:: >>> client.projects.list() diff --git a/scrapinghub/client/__init__.py b/scrapinghub/client/__init__.py index 89057af2..473ac6f3 100644 --- a/scrapinghub/client/__init__.py +++ b/scrapinghub/client/__init__.py @@ -31,7 +31,8 @@ class ScrapinghubClient(object): :param auth: (optional) Scrapy Cloud API key or other Scrapy Cloud auth credentials. If not provided, it will read, respectively, from - ``SH_APIKEY`` or ``SHUB_JOBAUTH`` environment variables. + ``SH_APIKEY`` (or its ``SHUB_APIKEY`` alias) or ``SHUB_JOBAUTH`` + environment variables. ``SHUB_JOBAUTH`` is available by default in *Scrapy Cloud*, but it does not provide access to all endpoints (e.g. job scheduling), but it is allowed to access job data, collections, crawl frontier. @@ -41,6 +42,12 @@ class ScrapinghubClient(object): :param dash_endpoint: (optional) Scrapy Cloud API URL. If not provided, it will be read from the ``SHUB_APIURL`` environment variable, or fall back to ``"https://app.zyte.com/api/"``. + :param dotenv_path: (optional) path to a ``.env`` file to read the + ``SH_APIKEY`` (or ``SHUB_APIKEY``) and ``SHUB_JOBAUTH`` credentials from + when ``auth`` is not provided and they are not set in the environment. + Defaults to the nearest ``.env`` file in the current directory or its + parents. Environment variables take precedence over the file, which is + never written to. :param kwargs: (optional) Additional arguments for :class:`~scrapinghub.hubstorage.HubstorageClient` constructor.
@@ -56,9 +63,10 @@ class ScrapinghubClient(object): """ def __init__(self, auth=None, dash_endpoint=None, - connection_timeout=DEFAULT_CONNECTION_TIMEOUT, **kwargs): + connection_timeout=DEFAULT_CONNECTION_TIMEOUT, + dotenv_path=None, **kwargs): self.projects = Projects(self) - login, password = parse_auth(auth) + login, password = parse_auth(auth, dotenv_path=dotenv_path) timeout = connection_timeout or DEFAULT_CONNECTION_TIMEOUT self._connection = Connection(apikey=login, password=password, diff --git a/scrapinghub/client/utils.py b/scrapinghub/client/utils.py index e78d848d..dc89e2fb 100644 --- a/scrapinghub/client/utils.py +++ b/scrapinghub/client/utils.py @@ -8,6 +8,29 @@ from codecs import decode import six +from dotenv import dotenv_values, find_dotenv + + +#: API key environment variables read by :func:`parse_auth`, in priority order. +_APIKEY_VARS = ('SH_APIKEY', 'SHUB_APIKEY') + +#: Authentication environment variables read by :func:`parse_auth`. +_DOTENV_AUTH_VARS = _APIKEY_VARS + ('SHUB_JOBAUTH',) + + +def _read_dotenv_auth(dotenv_path=None): + """Read Scrapy Cloud auth credentials from a ``.env`` file. + + Only the ``SH_APIKEY``, ``SHUB_APIKEY`` and ``SHUB_JOBAUTH`` variables are + read from the file; any other variables it contains are ignored. Returns a + ``{var: value}`` dict with the variables found in the file. The process + environment is left untouched -- callers are expected to let real + environment variables take precedence over the returned values. When + ``dotenv_path`` is None, the nearest ``.env`` file in the current directory + or its parents is used. + """ + values = dotenv_values(dotenv_path or find_dotenv(usecwd=True)) + return {var: values[var] for var in _DOTENV_AUTH_VARS if values.get(var)} class LogLevel(object): @@ -88,9 +111,17 @@ def update_kwargs(kwargs, **params): for k, v in params.items() if v is not None}) -def parse_auth(auth): +def parse_auth(auth, dotenv_path=None): """Parse authentication token. 
+ When ``auth`` is None, the credentials are read from the ``SH_APIKEY`` (or + its ``SHUB_APIKEY`` alias) or ``SHUB_JOBAUTH`` environment variables. If none + of them is set in the environment, they are read from a ``.env`` file + instead (see :func:`_read_dotenv_auth`); ``dotenv_path`` points at a file + other than the default ``.env``. Environment variables always take + precedence over the file, and the file is only read when needed -- the + environment is never modified. + >>> os.environ['SH_APIKEY'] = 'apikey' >>> parse_auth(None) ('apikey', '') @@ -104,18 +135,29 @@ def parse_auth(auth): ('1/2/3', 'some.jwt.token') """ if auth is None: - apikey = os.environ.get('SH_APIKEY') + apikey = next((os.environ[var] for var in _APIKEY_VARS + if os.environ.get(var)), None) + jobauth = os.environ.get('SHUB_JOBAUTH') + + # Fall back to the .env file only when the environment has no usable + # credentials, so an exported key never triggers a file lookup. + if not apikey and not jobauth: + dotenv = _read_dotenv_auth(dotenv_path) + apikey = next((dotenv[var] for var in _APIKEY_VARS + if dotenv.get(var)), None) + jobauth = dotenv.get('SHUB_JOBAUTH') + if apikey: return apikey, '' - jobauth = os.environ.get('SHUB_JOBAUTH') if jobauth: - warnings.warn("You are using the SHUB_JOBAUTH environment " - "variable which may not work for some API endpoints") + warnings.warn("You are using the SHUB_JOBAUTH credentials which " + "may not work for some API endpoints") return _search_for_jwt_credentials(jobauth) - raise RuntimeError("No API key provided and neither SH_APIKEY " - "nor SHUB_JOBAUTH environment variables is set") + raise RuntimeError("No API key provided and neither SH_APIKEY, " + "SHUB_APIKEY nor SHUB_JOBAUTH environment variables " + "is set") if isinstance(auth, tuple): all_strings = all(isinstance(k, six.string_types) for k in auth) diff --git a/setup.py b/setup.py index 8f73b841..315ea184 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,8 @@ platforms=['Any'], 
packages=['scrapinghub', 'scrapinghub.client', 'scrapinghub.hubstorage'], package_data={'scrapinghub': ['VERSION']}, - install_requires=['requests>=1.0', 'retrying>=1.3.3', 'six>=1.10.0'], + install_requires=['python-dotenv>=1.0.0', 'requests>=1.0', + 'retrying>=1.3.3', 'six>=1.10.0'], extras_require={'msgpack': mpack_required}, python_requires='>=3.10', classifiers=[ diff --git a/tests/client/test_utils.py b/tests/client/test_utils.py index 253e1b4a..63d1b160 100644 --- a/tests/client/test_utils.py +++ b/tests/client/test_utils.py @@ -4,7 +4,19 @@ import mock -from scrapinghub.client.utils import parse_auth, parse_job_key +from scrapinghub.client.utils import ( + parse_auth, parse_job_key, _read_dotenv_auth, +) + + +@pytest.fixture(autouse=True) +def isolated_auth_env(tmp_path, monkeypatch): + """Keep auth resolution hermetic: drop any ambient auth env vars and run + from an empty directory so ``find_dotenv()`` can't pick up a stray ``.env`` + from the developer's working tree.""" + for var in ('SH_APIKEY', 'SHUB_APIKEY', 'SHUB_JOBAUTH'): + monkeypatch.delenv(var, raising=False) + monkeypatch.chdir(tmp_path) def test_parse_auth_none(): @@ -22,6 +34,16 @@ def test_parse_auth_none_with_multiple_env(): assert parse_auth(None) == ('testkey', '') +@mock.patch.dict(os.environ, {'SHUB_APIKEY': 'aliaskey'}) +def test_parse_auth_none_with_shub_apikey_alias(): + assert parse_auth(None) == ('aliaskey', '') + + +@mock.patch.dict(os.environ, {'SH_APIKEY': 'primary', 'SHUB_APIKEY': 'alias'}) +def test_parse_auth_sh_apikey_takes_precedence_over_alias(): + assert parse_auth(None) == ('primary', '') + + def test_parse_auth_tuple(): assert parse_auth(('test', 'test')) == ('test', 'test') assert parse_auth(('apikey', '')) == ('apikey', '') @@ -79,6 +101,86 @@ def test_parse_auth_none_with_jwt_token_env(): assert parse_auth(None) == (test_job, test_token) +def test_read_dotenv_auth_default_path(tmp_path): + (tmp_path / '.env').write_text('SH_APIKEY=FROMDOTENV\n') + + assert 
_read_dotenv_auth() == {'SH_APIKEY': 'FROMDOTENV'} + assert 'SH_APIKEY' not in os.environ # reading the file must not touch env + + +def test_read_dotenv_auth_parent_dir(tmp_path, monkeypatch): + (tmp_path / '.env').write_text('SHUB_APIKEY=FROMPARENT\n') + subdir = tmp_path / 'project' / 'subdir' + subdir.mkdir(parents=True) + monkeypatch.chdir(subdir) + + assert _read_dotenv_auth() == {'SHUB_APIKEY': 'FROMPARENT'} + + +def test_read_dotenv_auth_custom_path(tmp_path): + env_file = tmp_path / 'custom.env' + env_file.write_text('SH_APIKEY=CUSTOMKEY\nSHUB_JOBAUTH=CUSTOMJWT\n') + + assert _read_dotenv_auth(str(env_file)) == { + 'SH_APIKEY': 'CUSTOMKEY', 'SHUB_JOBAUTH': 'CUSTOMJWT', + } + + +def test_read_dotenv_auth_only_reads_auth_vars(tmp_path): + env_file = tmp_path / 'custom.env' + env_file.write_text('SH_APIKEY=ONLYTHIS\nOTHER_VAR=ignored\n') + + assert _read_dotenv_auth(str(env_file)) == {'SH_APIKEY': 'ONLYTHIS'} + assert 'OTHER_VAR' not in os.environ + + +def test_read_dotenv_auth_missing_file(tmp_path): + assert _read_dotenv_auth(str(tmp_path / 'does-not-exist.env')) == {} + + +def test_parse_auth_none_reads_dotenv(tmp_path): + env_file = tmp_path / 'custom.env' + env_file.write_text('SH_APIKEY=DOTENVKEY\n') + + assert parse_auth(None, dotenv_path=str(env_file)) == ('DOTENVKEY', '') + + +def test_parse_auth_none_reads_shub_apikey_alias_from_dotenv(tmp_path): + env_file = tmp_path / 'custom.env' + env_file.write_text('SHUB_APIKEY=ALIASFROMFILE\n') + + assert parse_auth(None, dotenv_path=str(env_file)) == ('ALIASFROMFILE', '') + + +def test_parse_auth_none_reads_jobauth_from_dotenv(tmp_path): + test_job, test_token = '1/2/3', 'some.jwt.token' + raw_token = (test_job + ':' + test_token).encode('utf8') + encoded_token = encode(raw_token, 'hex_codec').decode('ascii') + env_file = tmp_path / 'custom.env' + env_file.write_text('SHUB_JOBAUTH={}\n'.format(encoded_token)) + + with pytest.warns(UserWarning): + assert parse_auth(None, dotenv_path=str(env_file)) == 
(test_job, test_token) + + +def test_parse_auth_env_takes_precedence_over_dotenv(tmp_path, monkeypatch): + monkeypatch.setenv('SH_APIKEY', 'FROMENV') + env_file = tmp_path / 'custom.env' + env_file.write_text('SH_APIKEY=FROMDOTENV\n') + + assert parse_auth(None, dotenv_path=str(env_file)) == ('FROMENV', '') + + +def test_parse_auth_none_does_not_mutate_environ(tmp_path): + env_file = tmp_path / 'custom.env' + env_file.write_text('SH_APIKEY=DOTENVKEY\nSHUB_JOBAUTH=JWT\n') + + parse_auth(None, dotenv_path=str(env_file)) + + assert 'SH_APIKEY' not in os.environ + assert 'SHUB_JOBAUTH' not in os.environ + + def test_parse_job_key(): job_key = parse_job_key('123/10/11') assert job_key.project_id == '123' From 3cd2cc531649224ea1ce0efb6571d3cfa20a5820 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Fri, 19 Jun 2026 11:49:14 -0300 Subject: [PATCH 2/2] docs update --- docs/quickstart.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/quickstart.rst b/docs/quickstart.rst index b6f44494..99c94066 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -41,6 +41,9 @@ variable instead:: Instead of exporting the variable yourself, you can store it in a ``.env`` -file and let the client load it:: +file and let the client load it: + +.. code-block:: bash + :caption: :file:`.env` + SH_APIKEY=84c87545607a4bc0**************** By default the client reads the nearest ``.env`` file, looking in the current