Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
Release notes
=============

2.8.0 (unreleased)
-------------------

- accept ``SHUB_APIKEY`` as an alias for the ``SH_APIKEY`` environment variable
- load the ``SH_APIKEY`` (or ``SHUB_APIKEY``) and ``SHUB_JOBAUTH`` environment
variables from a ``.env`` file via `python-dotenv`; use the new
``dotenv_path`` argument of
:class:`~scrapinghub.client.ScrapinghubClient` to read a file other than the
default ``.env``

2.7.0 (2026-04-07)
-------------------

Expand Down
24 changes: 24 additions & 0 deletions docs/quickstart.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,30 @@ Instantiate a new client with your Scrapy Cloud API key::
.. note:: Your Scrapy Cloud API key is available at the bottom of
https://app.zyte.com/o/settings after you sign up.

If you instantiate the client without an explicit API key, it reads the
``SH_APIKEY`` (or its ``SHUB_APIKEY`` alias, or ``SHUB_JOBAUTH``) environment
variable instead::

>>> client = ScrapinghubClient() # reads SH_APIKEY from the environment

Instead of exporting the variable yourself, you can store it in a ``.env``
file and let the client load it:

.. code-block:: bash
:caption: :file:`.env`

SH_APIKEY=84c87545607a4bc0****************
Comment thread
elacuesta marked this conversation as resolved.

By default the client reads the nearest ``.env`` file, looking in the current
directory and then walking up through its parent directories. Use the
``dotenv_path`` argument to point it at a different file::

>>> client = ScrapinghubClient(dotenv_path='/path/to/myenv')

Only the ``SH_APIKEY``, ``SHUB_APIKEY`` and ``SHUB_JOBAUTH`` variables are read
from the file; any other variables it contains are ignored. The file is only
consulted when none of these variables is set in the environment, so a
variable already set in the environment always takes precedence over the file.

List your deployed projects::

>>> client.projects.list()
Expand Down
14 changes: 11 additions & 3 deletions scrapinghub/client/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ class ScrapinghubClient(object):

:param auth: (optional) Scrapy Cloud API key or other Scrapy Cloud auth
credentials. If not provided, it will read, respectively, from
``SH_APIKEY`` or ``SHUB_JOBAUTH`` environment variables.
``SH_APIKEY`` (or its ``SHUB_APIKEY`` alias) or ``SHUB_JOBAUTH``
environment variables.
``SHUB_JOBAUTH`` is available by default in *Scrapy Cloud*. It does not
provide access to all endpoints (e.g. job scheduling), but it is allowed
to access job data, collections and the crawl frontier.
Expand All @@ -41,6 +42,12 @@ class ScrapinghubClient(object):
:param dash_endpoint: (optional) Scrapy Cloud API URL.
If not provided, it will be read from the ``SHUB_APIURL`` environment
variable, or fall back to ``"https://app.zyte.com/api/"``.
:param dotenv_path: (optional) path to a ``.env`` file to read the
``SH_APIKEY`` (or ``SHUB_APIKEY``) and ``SHUB_JOBAUTH`` credentials from.
The file is only read when ``auth`` is not provided and none of these
variables is set in the environment, so environment variables always take
precedence over the file. Defaults to the nearest ``.env`` file in the
current directory or its parents. The file is never written to.
:param kwargs: (optional) Additional arguments for
:class:`~scrapinghub.hubstorage.HubstorageClient` constructor.

Expand All @@ -56,9 +63,10 @@ class ScrapinghubClient(object):
"""

def __init__(self, auth=None, dash_endpoint=None,
connection_timeout=DEFAULT_CONNECTION_TIMEOUT, **kwargs):
connection_timeout=DEFAULT_CONNECTION_TIMEOUT,
dotenv_path=None, **kwargs):
self.projects = Projects(self)
login, password = parse_auth(auth)
login, password = parse_auth(auth, dotenv_path=dotenv_path)
timeout = connection_timeout or DEFAULT_CONNECTION_TIMEOUT
self._connection = Connection(apikey=login,
password=password,
Expand Down
56 changes: 49 additions & 7 deletions scrapinghub/client/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,29 @@
from codecs import decode

import six
from dotenv import dotenv_values, find_dotenv


#: Names of the environment variables that may carry the API key. They are
#: listed in priority order: :func:`parse_auth` uses the first one that is set.
_APIKEY_VARS = ('SH_APIKEY', 'SHUB_APIKEY')

#: Every authentication variable consulted by :func:`parse_auth`: the API key
#: names above followed by the job-auth token variable.
_DOTENV_AUTH_VARS = (*_APIKEY_VARS, 'SHUB_JOBAUTH')


def _read_dotenv_auth(dotenv_path=None):
    """Collect Scrapy Cloud credentials stored in a ``.env`` file.

    The file is parsed with :func:`dotenv.dotenv_values`, so the process
    environment is not modified; it is up to the caller to give real
    environment variables priority over what is returned here.

    :param dotenv_path: (optional) path of the file to parse. When it is
        None (or otherwise falsy), the file is located with
        ``find_dotenv(usecwd=True)``, i.e. the nearest ``.env`` starting from
        the current working directory.
    :return: a ``{name: value}`` dict restricted to the ``SH_APIKEY``,
        ``SHUB_APIKEY`` and ``SHUB_JOBAUTH`` entries that have a non-empty
        value in the file; every other entry of the file is dropped.
    """
    env_file = dotenv_path if dotenv_path else find_dotenv(usecwd=True)
    parsed = dotenv_values(env_file)

    credentials = {}
    for name in _DOTENV_AUTH_VARS:
        # Missing and empty entries are both treated as "not provided".
        if parsed.get(name):
            credentials[name] = parsed[name]
    return credentials


class LogLevel(object):
Expand Down Expand Up @@ -88,9 +111,17 @@ def update_kwargs(kwargs, **params):
for k, v in params.items() if v is not None})


def parse_auth(auth):
def parse_auth(auth, dotenv_path=None):
"""Parse authentication token.

When ``auth`` is None, the credentials are read from the ``SH_APIKEY`` (or
its ``SHUB_APIKEY`` alias) or ``SHUB_JOBAUTH`` environment variables. If none
of them is set in the environment, they are read from a ``.env`` file
instead (see :func:`_read_dotenv_auth`); ``dotenv_path`` points at a file
other than the default ``.env``. Environment variables always take
precedence over the file, and the file is only read when needed -- the
environment is never modified.

>>> os.environ['SH_APIKEY'] = 'apikey'
>>> parse_auth(None)
('apikey', '')
Expand All @@ -104,18 +135,29 @@ def parse_auth(auth):
('1/2/3', 'some.jwt.token')
"""
if auth is None:
apikey = os.environ.get('SH_APIKEY')
apikey = next((os.environ[var] for var in _APIKEY_VARS
if os.environ.get(var)), None)
jobauth = os.environ.get('SHUB_JOBAUTH')

# Fall back to the .env file only when the environment has no usable
# credentials, so an exported key never triggers a file lookup.
if not apikey and not jobauth:
dotenv = _read_dotenv_auth(dotenv_path)
apikey = next((dotenv[var] for var in _APIKEY_VARS
if dotenv.get(var)), None)
jobauth = dotenv.get('SHUB_JOBAUTH')
Comment on lines +142 to +148

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So if SH_APIKEY is set as an env var but SHUB_JOBAUTH is not, a .env SH_APIKEY takes over the env var?


if apikey:
return apikey, ''

jobauth = os.environ.get('SHUB_JOBAUTH')
if jobauth:
warnings.warn("You are using the SHUB_JOBAUTH environment "
"variable which may not work for some API endpoints")
warnings.warn("You are using the SHUB_JOBAUTH credentials which "
"may not work for some API endpoints")
return _search_for_jwt_credentials(jobauth)

raise RuntimeError("No API key provided and neither SH_APIKEY "
"nor SHUB_JOBAUTH environment variables is set")
raise RuntimeError("No API key provided and neither SH_APIKEY, "
"SHUB_APIKEY nor SHUB_JOBAUTH environment variables "
"is set")

if isinstance(auth, tuple):
all_strings = all(isinstance(k, six.string_types) for k in auth)
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
platforms=['Any'],
packages=['scrapinghub', 'scrapinghub.client', 'scrapinghub.hubstorage'],
package_data={'scrapinghub': ['VERSION']},
install_requires=['requests>=1.0', 'retrying>=1.3.3', 'six>=1.10.0'],
install_requires=['python-dotenv>=1.0.0', 'requests>=1.0',
'retrying>=1.3.3', 'six>=1.10.0'],
extras_require={'msgpack': mpack_required},
python_requires='>=3.10',
classifiers=[
Expand Down
104 changes: 103 additions & 1 deletion tests/client/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,19 @@

import mock

from scrapinghub.client.utils import parse_auth, parse_job_key
from scrapinghub.client.utils import (
parse_auth, parse_job_key, _read_dotenv_auth,
)


@pytest.fixture(autouse=True)
def isolated_auth_env(tmp_path, monkeypatch):
    """Make auth resolution hermetic for every test in this module.

    The working directory is switched to an empty temporary directory and the
    ambient auth environment variables are removed, so ``find_dotenv()``
    cannot pick up a stray ``.env`` from the developer's working tree.
    """
    monkeypatch.chdir(tmp_path)
    auth_vars = ('SH_APIKEY', 'SHUB_APIKEY', 'SHUB_JOBAUTH')
    for name in auth_vars:
        # raising=False: it is fine if the variable was not set to begin with.
        monkeypatch.delenv(name, raising=False)


def test_parse_auth_none():
Expand All @@ -22,6 +34,16 @@ def test_parse_auth_none_with_multiple_env():
assert parse_auth(None) == ('testkey', '')


def test_parse_auth_none_with_shub_apikey_alias():
    """``SHUB_APIKEY`` on its own in the environment is used as the API key."""
    with mock.patch.dict(os.environ, {'SHUB_APIKEY': 'aliaskey'}):
        assert parse_auth(None) == ('aliaskey', '')


def test_parse_auth_sh_apikey_takes_precedence_over_alias():
    """When both names are set, ``SH_APIKEY`` wins over ``SHUB_APIKEY``."""
    both_set = {'SH_APIKEY': 'primary', 'SHUB_APIKEY': 'alias'}
    with mock.patch.dict(os.environ, both_set):
        assert parse_auth(None) == ('primary', '')


def test_parse_auth_tuple():
assert parse_auth(('test', 'test')) == ('test', 'test')
assert parse_auth(('apikey', '')) == ('apikey', '')
Expand Down Expand Up @@ -79,6 +101,86 @@ def test_parse_auth_none_with_jwt_token_env():
assert parse_auth(None) == (test_job, test_token)


def test_read_dotenv_auth_default_path(tmp_path):
    """Without a path, the ``.env`` file of the current directory is read."""
    dotenv_file = tmp_path / '.env'
    dotenv_file.write_text('SH_APIKEY=FROMDOTENV\n')

    found = _read_dotenv_auth()

    assert found == {'SH_APIKEY': 'FROMDOTENV'}
    # Reading the file must not touch the process environment.
    assert 'SH_APIKEY' not in os.environ


def test_read_dotenv_auth_parent_dir(tmp_path, monkeypatch):
    """A ``.env`` file in an ancestor of the working directory is found."""
    nested = tmp_path / 'project' / 'subdir'
    nested.mkdir(parents=True)
    (tmp_path / '.env').write_text('SHUB_APIKEY=FROMPARENT\n')
    # Run from two levels below the directory that holds the file.
    monkeypatch.chdir(nested)

    found = _read_dotenv_auth()

    assert found == {'SHUB_APIKEY': 'FROMPARENT'}


def test_read_dotenv_auth_custom_path(tmp_path):
    """An explicit path is honoured and all auth variables in it are returned."""
    custom_file = tmp_path / 'custom.env'
    custom_file.write_text('SH_APIKEY=CUSTOMKEY\nSHUB_JOBAUTH=CUSTOMJWT\n')

    expected = {'SH_APIKEY': 'CUSTOMKEY', 'SHUB_JOBAUTH': 'CUSTOMJWT'}

    assert _read_dotenv_auth(str(custom_file)) == expected


def test_read_dotenv_auth_only_reads_auth_vars(tmp_path):
    """Entries of the file that are not auth variables are ignored."""
    custom_file = tmp_path / 'custom.env'
    custom_file.write_text('SH_APIKEY=ONLYTHIS\nOTHER_VAR=ignored\n')

    found = _read_dotenv_auth(str(custom_file))

    assert found == {'SH_APIKEY': 'ONLYTHIS'}
    # The unrelated entry must not leak into the process environment either.
    assert 'OTHER_VAR' not in os.environ


def test_read_dotenv_auth_missing_file(tmp_path):
    """A path that does not exist yields an empty result."""
    missing = tmp_path / 'does-not-exist.env'

    assert _read_dotenv_auth(str(missing)) == {}


def test_parse_auth_none_reads_dotenv(tmp_path):
    """With no auth in the environment, ``SH_APIKEY`` comes from the file."""
    custom_file = tmp_path / 'custom.env'
    custom_file.write_text('SH_APIKEY=DOTENVKEY\n')

    credentials = parse_auth(None, dotenv_path=str(custom_file))

    assert credentials == ('DOTENVKEY', '')


def test_parse_auth_none_reads_shub_apikey_alias_from_dotenv(tmp_path):
    """The ``SHUB_APIKEY`` alias is also accepted when it comes from the file."""
    custom_file = tmp_path / 'custom.env'
    custom_file.write_text('SHUB_APIKEY=ALIASFROMFILE\n')

    credentials = parse_auth(None, dotenv_path=str(custom_file))

    assert credentials == ('ALIASFROMFILE', '')


def test_parse_auth_none_reads_jobauth_from_dotenv(tmp_path):
    """A hex-encoded ``SHUB_JOBAUTH`` in the file is decoded, with a warning."""
    job_key, jwt = '1/2/3', 'some.jwt.token'
    # The variable holds "<job key>:<token>" encoded as hexadecimal text.
    plain = '{}:{}'.format(job_key, jwt).encode('utf8')
    hex_token = encode(plain, 'hex_codec').decode('ascii')
    custom_file = tmp_path / 'custom.env'
    custom_file.write_text('SHUB_JOBAUTH={}\n'.format(hex_token))

    with pytest.warns(UserWarning):
        assert parse_auth(None, dotenv_path=str(custom_file)) == (job_key, jwt)


def test_parse_auth_env_takes_precedence_over_dotenv(tmp_path, monkeypatch):
    """An exported ``SH_APIKEY`` wins over the value stored in the file."""
    custom_file = tmp_path / 'custom.env'
    custom_file.write_text('SH_APIKEY=FROMDOTENV\n')
    monkeypatch.setenv('SH_APIKEY', 'FROMENV')

    credentials = parse_auth(None, dotenv_path=str(custom_file))

    assert credentials == ('FROMENV', '')


def test_parse_auth_none_does_not_mutate_environ(tmp_path):
    """Resolving auth from the file leaves ``os.environ`` untouched."""
    custom_file = tmp_path / 'custom.env'
    custom_file.write_text('SH_APIKEY=DOTENVKEY\nSHUB_JOBAUTH=JWT\n')

    parse_auth(None, dotenv_path=str(custom_file))

    # Neither variable defined in the file may have been exported.
    for name in ('SH_APIKEY', 'SHUB_JOBAUTH'):
        assert name not in os.environ


def test_parse_job_key():
job_key = parse_job_key('123/10/11')
assert job_key.project_id == '123'
Expand Down
Loading