diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 7668179b0..000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,16 +0,0 @@ -version: 2 - -jobs: - build: - docker: - - image: circleci/python:3.6.2 - steps: - - checkout - - run: - command: ./continuous_integration/build_doc.sh - environment: - MINICONDA_PATH: ~/miniconda - CONDA_ENV_NAME: testenv - - store_artifacts: - path: doc/_build/html - destination: doc diff --git a/.readthedocs-requirements.txt b/.readthedocs-requirements.txt index 3699035ec..05dd31dee 100644 --- a/.readthedocs-requirements.txt +++ b/.readthedocs-requirements.txt @@ -1,4 +1,5 @@ sphinx +docutils<0.18 numpy matplotlib pillow diff --git a/CHANGES.rst b/CHANGES.rst index 5328c2c79..d20a7a388 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,17 +1,78 @@ Latest changes ============== -1.0.1 ------ +Release 1.2.0 +------------- + +- Fix a security issue where ``eval(pre_dispatch)`` could potentially run + arbitrary code. Now only basic numerics are supported. + https://github.com/joblib/joblib/pull/1327 + +- Make sure that joblib works even when multiprocessing is not available, + for instance with Pyodide + https://github.com/joblib/joblib/pull/1256 + +- Avoid unnecessary warnings when workers and main process delete + the temporary memmap folder contents concurrently. + https://github.com/joblib/joblib/pull/1263 + +- Vendor loky 3.1.0 with several fixes to more robustly forcibly terminate + worker processes in case of a crash. + https://github.com/joblib/joblib/pull/1269 + +- Fix memory alignment bug for pickles containing numpy arrays. + This is especially important when loading the pickle with + ``mmap_mode != None`` as the resulting ``numpy.memmap`` object + would not be able to correct the misalignment without performing + a memory copy. + This bug would cause invalid computation and segmentation faults + with native code that would directly access the underlying data + buffer of a numpy array, for instance C/C++/Cython code compiled + with older GCC versions or some old OpenBLAS written in platform + specific assembly. + https://github.com/joblib/joblib/pull/1254 + +- Vendor cloudpickle 2.2.0 which adds support for PyPy 3.8+. + +- Vendor loky 3.3.0 which fixes a bug with leaking processes in case of + nested loky parallel calls and more reliability spawn the correct + number of reusable workers. + +Release 1.1.0 +-------------- + +- Fix byte order inconsistency issue during deserialization using joblib.load + in cross-endian environment: the numpy arrays are now always loaded to + use the system byte order, independently of the byte order of the system + that serialized the pickle. + https://github.com/joblib/joblib/pull/1181 + +- Fix joblib.Memory bug with the ``ignore`` parameter when the cached function + is a decorated function. + https://github.com/joblib/joblib/pull/1165 +- Fix `joblib.Memory` to properly handle caching for functions defined + interactively in a IPython session or in Jupyter notebook cell. + https://github.com/joblib/joblib/pull/1214 + +- Update vendored loky (from version 2.9 to 3.0) and cloudpickle (from + version 1.6 to 2.0) + https://github.com/joblib/joblib/pull/1218 + +Release 1.0.1 +------------- + +- Add check_call_in_cache method to check cache without calling function. 
+ https://github.com/joblib/joblib/pull/820 + - dask: avoid redundant scattering of large arguments to make a more efficient use of the network resources and avoid crashing dask with "OSError: [Errno 55] No buffer space available" or "ConnectionResetError: [Errno 104] connection reset by peer". https://github.com/joblib/joblib/pull/1133 -1.0.0 ------ +Release 1.0.0 +------------- - Make `joblib.hash` and `joblib.Memory` caching system compatible with `numpy >= 1.20.0`. Also make it explicit in the documentation that users should now @@ -245,7 +306,7 @@ Maxime Weyl Maxime Weyl Loading a corrupted cached file with mmap mode enabled would - recompute the results and return them without memmory mapping. + recompute the results and return them without memory mapping. Release 0.12.3 @@ -329,8 +390,8 @@ Thomas Moreau Implement the ``'loky'`` backend with @ogrisel. This backend relies on a robust implementation of ``concurrent.futures.ProcessPoolExecutor`` - with spawned processes that can be reused accross the ``Parallel`` - calls. This fixes the bad interation with third paty libraries relying on + with spawned processes that can be reused across the ``Parallel`` + calls. This fixes the bad integration with third paty libraries relying on thread pools, described in https://pythonhosted.org/joblib/parallel.html#bad-interaction-of-multiprocessing-and-third-party-libraries Limit the number of threads used in worker processes by C-libraries that @@ -390,7 +451,7 @@ Alexandre Abadie Add ``register_compressor`` function for extending available compressors. - Allow passing a string to ``compress`` parameter in ``dump`` funtion. This + Allow passing a string to ``compress`` parameter in ``dump`` function. This string should correspond to the compressor used (e.g. zlib, gzip, lz4, etc). The default compression level is used in this case. @@ -440,7 +501,7 @@ Loïc Estève Loïc Estève Fix handling of memmap objects with offsets greater than - mmap.ALLOCATIONGRANULARITY in ``joblib.Parrallel``. See + mmap.ALLOCATIONGRANULARITY in ``joblib.Parallel``. See https://github.com/joblib/joblib/issues/451 for more details. Loïc Estève @@ -856,7 +917,7 @@ Release 0.6.5 2012-09-15 Yannick Schwartz - BUG: make sure that sets and dictionnaries give reproducible hashes + BUG: make sure that sets and dictionaries give reproducible hashes 2012-07-18 @@ -887,7 +948,7 @@ GaelVaroquaux BUG: non-reproducible hashing: order of kwargs - The ordering of a dictionnary is random. As a result the function hashing + The ordering of a dictionary is random. As a result the function hashing was not reproducible. Pretty hard to test Release 0.6.3 @@ -1039,7 +1100,7 @@ Release 0.5.3 2011-06-25 Gael varoquaux - API: All the usefull symbols in the __init__ + API: All the useful symbols in the __init__ Release 0.5.2 @@ -1197,7 +1258,7 @@ Gael varoquaux Gael varoquaux 2010-07-29 - MISC: Silence tests (and hopefuly Yaroslav :P) + MISC: Silence tests (and hopefully Yaroslav :P) Release 0.4.3 ---------------- diff --git a/LICENSE.txt b/LICENSE.txt index 0f469af82..910537bd3 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,6 @@ BSD 3-Clause License -Copyright (c) 2008-2016, The joblib developers. +Copyright (c) 2008-2021, The joblib developers. All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/README.rst b/README.rst index f1aac980c..f9defa1db 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -|PyPi| |Azure| |Codecov| +|PyPi| |Azure| |ReadTheDocs| |Codecov| .. 
|PyPi| image:: https://badge.fury.io/py/joblib.svg :target: https://badge.fury.io/py/joblib @@ -6,7 +6,11 @@ .. |Azure| image:: https://dev.azure.com/joblib/joblib/_apis/build/status/joblib.joblib?branchName=master :target: https://dev.azure.com/joblib/joblib/_build?definitionId=3&_a=summary&branchFilter=40 - :alt: Codecov coverage + :alt: Azure CI status + +.. |ReadTheDocs| image:: https://readthedocs.org/projects/joblib/badge/?version=latest + :target: https://joblib.readthedocs.io/en/latest/?badge=latest + :alt: Documentation Status .. |Codecov| image:: https://codecov.io/gh/joblib/joblib/branch/master/graph/badge.svg :target: https://codecov.io/gh/joblib/joblib @@ -44,7 +48,7 @@ Dependencies ============ - Joblib has no mandatory dependencies besides Python (supported versions are - 2.7+ and 3.4+). + 3.7+). - Joblib has an optional dependency on Numpy (at least version 1.6.1) for array manipulation. - Joblib includes its own vendored copy of @@ -130,40 +134,3 @@ but, the following git command may be used to generate the lines:: git log --abbrev-commit --date=short --no-merges --sparse -Licensing ---------- - -joblib is **BSD-licenced** (3 clause): - - This software is OSI Certified Open Source Software. - OSI Certified is a certification mark of the Open Source Initiative. - - Copyright (c) 2009-2011, joblib developpers - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither the name of Gael Varoquaux. nor the names of other joblib - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - **This software is provided by the copyright holders and contributors - "as is" and any express or implied warranties, including, but not - limited to, the implied warranties of merchantability and fitness for - a particular purpose are disclaimed. In no event shall the copyright - owner or contributors be liable for any direct, indirect, incidental, - special, exemplary, or consequential damages (including, but not - limited to, procurement of substitute goods or services; loss of use, - data, or profits; or business interruption) however caused and on any - theory of liability, whether in contract, strict liability, or tort - (including negligence or otherwise) arising in any way out of the use - of this software, even if advised of the possibility of such - damage.** diff --git a/TODO.rst b/TODO.rst index 83571634e..0028cc37b 100644 --- a/TODO.rst +++ b/TODO.rst @@ -40,7 +40,7 @@ Tasks at hand on joblib, in increasing order of difficulty. * add a 'argument_hash' keyword argument to Memory.cache, to be able to replace the hashing logic of memory for the input arguments. It should - accept as an input the dictionnary of arguments, as returned in + accept as an input the dictionary of arguments, as returned in func_inspect, and return a string. * add a sqlite db for provenance tracking. 
Store computation time and usage diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 845f3e300..3b58a4d3c 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -3,6 +3,13 @@ # Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more: # https://docs.microsoft.com/azure/devops/pipelines/languages/python +schedules: +- cron: "0 9 * * *" + displayName: Daily build + branches: + include: + - master + trigger: - master @@ -38,49 +45,48 @@ jobs: PYTHON_VERSION: "pypy3" LOKY_MAX_CPU_COUNT: "2" - linux_py38_distributed: + linux_py39_sklearn_tests: + imageName: 'ubuntu-latest' + PYTHON_VERSION: "3.9" + # SKIP_TESTS: "true" + SKLEARN_TESTS: "true" + linux_py310_distributed: # To be updated regularly to use the most recent versions of the # dependencies. imageName: 'ubuntu-latest' - PYTHON_VERSION: "3.8" - EXTRA_CONDA_PACKAGES: "numpy=1.18 distributed=2.17" - linux_py37_sklearn_tests: - imageName: 'ubuntu-latest' - PYTHON_VERSION: "3.7" - EXTRA_CONDA_PACKAGES: "numpy=1.16" - SKIP_TESTS: "true" - SKLEARN_TESTS: "true" + PYTHON_VERSION: "3.10" + EXTRA_CONDA_PACKAGES: "numpy=1.23 distributed=2022.2.0" linux_py37_distributed: imageName: 'ubuntu-latest' PYTHON_VERSION: "3.7" EXTRA_CONDA_PACKAGES: "numpy=1.15 distributed=2.13" - linux_py36_cython: + linux_py310_cython: imageName: 'ubuntu-latest' - PYTHON_VERSION: "3.6" - EXTRA_CONDA_PACKAGES: "numpy=1.14" + PYTHON_VERSION: "3.10" + EXTRA_CONDA_PACKAGES: "numpy=1.23" CYTHON: "true" - linux_py36_no_multiprocessing_no_lzma: + linux_py37_no_multiprocessing_no_lzma: imageName: 'ubuntu-latest' - PYTHON_VERSION: "3.6" - EXTRA_CONDA_PACKAGES: "numpy=1.14" + PYTHON_VERSION: "3.7" + EXTRA_CONDA_PACKAGES: "numpy=1.15" JOBLIB_MULTIPROCESSING: "0" NO_LZMA: "1" - linux_py36_no_numpy: + linux_py37_no_numpy: imageName: 'ubuntu-latest' - PYTHON_VERSION: "3.6" - - windows_py38: - imageName: "vs2017-win2016" - PYTHON_VERSION: "3.8" - EXTRA_CONDA_PACKAGES: "numpy=1.18" - - macos_py38: - imageName: "macos-10.14" - PYTHON_VERSION: "3.8" - EXTRA_CONDA_PACKAGES: "numpy=1.18" - macos_py36_no_numpy: - imageName: "macos-10.14" - PYTHON_VERSION: "3.6" + PYTHON_VERSION: "3.7" + + windows_py310: + imageName: "windows-latest" + PYTHON_VERSION: "3.10" + EXTRA_CONDA_PACKAGES: "numpy=1.23" + + macos_py310: + imageName: "macos-latest" + PYTHON_VERSION: "3.10" + EXTRA_CONDA_PACKAGES: "numpy=1.23" + macos_py37_no_numpy: + imageName: "macos-latest" + PYTHON_VERSION: "3.7" variables: JUNITXML: 'test-data.xml' diff --git a/benchmarks/bench_pickle.py b/benchmarks/bench_pickle.py index 98f0bff26..7de096ef5 100755 --- a/benchmarks/bench_pickle.py +++ b/benchmarks/bench_pickle.py @@ -1,7 +1,7 @@ """ Benching joblib pickle I/O. -Warning: this is slow, and the benchs are easily offset by other disk +Warning: this is slow, and the benches are easily offset by other disk activity. 
""" import os diff --git a/conftest.py b/conftest.py index e246e951f..875e9b9b9 100644 --- a/conftest.py +++ b/conftest.py @@ -1,10 +1,10 @@ -from distutils.version import LooseVersion import pytest from _pytest.doctest import DoctestItem import logging from joblib.parallel import mp +from joblib.backports import LooseVersion try: import lz4 except ImportError: diff --git a/continuous_integration/build_doc.sh b/continuous_integration/build_doc.sh deleted file mode 100755 index 0c1e6e206..000000000 --- a/continuous_integration/build_doc.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env bash -set -x -set -e - -# deactivate circleci virtualenv and setup a miniconda env instead -if [[ `type -t deactivate` ]]; then - deactivate -fi - -wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \ - -O miniconda.sh -chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH -export PATH="$MINICONDA_PATH/bin:$PATH" -conda update --yes --quiet conda - -conda create -n $CONDA_ENV_NAME --yes --quiet python=3 -source activate $CONDA_ENV_NAME - -conda install --yes --quiet pip numpy sphinx matplotlib pillow dask distributed -pip install sphinx-gallery numpydoc lz4 - -python setup.py develop - -make doc 2>&1 diff --git a/continuous_integration/install.sh b/continuous_integration/install.sh index d191e1467..41e21e77c 100755 --- a/continuous_integration/install.sh +++ b/continuous_integration/install.sh @@ -13,17 +13,17 @@ set -e create_new_conda_env() { conda update --yes conda TO_INSTALL="python=$PYTHON_VERSION pip pytest $EXTRA_CONDA_PACKAGES" - conda create -n testenv --yes $TO_INSTALL + conda create -n testenv --yes -c conda-forge $TO_INSTALL source activate testenv } create_new_pypy3_env() { - PYPY_FOLDER="pypy3.6-v7.3.1-linux64" + PYPY_FOLDER="pypy3.7-v7.3.7-linux64" wget https://downloads.python.org/pypy/$PYPY_FOLDER.tar.bz2 tar xvf $PYPY_FOLDER.tar.bz2 $PYPY_FOLDER/bin/pypy3 -m venv pypy3 source pypy3/bin/activate - pip install -U pip pytest + pip install -U pip 'pytest' } if [[ "$PYTHON_VERSION" == "pypy3" ]]; then @@ -47,10 +47,7 @@ if [ -n "$NUMPY_VERSION" ]; then fi if [[ "$COVERAGE" == "true" ]]; then - # TODO: unpin when https://github.com/nedbat/coveragepy/issues/883 is fixed - # Weird issues with recent version of coverage: unpin when not causing - # pytest to raise INTERNALERROR exceptions. - PIP_INSTALL_PACKAGES="$PIP_INSTALL_PACKAGES coverage==4.5.4 pytest-cov codecov" + PIP_INSTALL_PACKAGES="$PIP_INSTALL_PACKAGES coverage pytest-cov codecov" fi if [[ "pypy3" != *"$PYTHON_VERSION"* ]]; then diff --git a/continuous_integration/run_tests.sh b/continuous_integration/run_tests.sh index 51bed5fb0..68d233fdc 100755 --- a/continuous_integration/run_tests.sh +++ b/continuous_integration/run_tests.sh @@ -20,27 +20,46 @@ if [[ "$SKIP_TESTS" != "true" ]]; then export PYTEST_ADDOPTS="--cov=joblib --cov-append" fi - pytest joblib -vl --timeout=60 --junitxml="${JUNITXML}" + pytest joblib -vl --timeout=120 --junitxml="${JUNITXML}" make test-doc fi if [[ "$SKLEARN_TESTS" == "true" ]]; then - # Install scikit-learn from conda and test against the installed + # Install the nightly build of scikit-learn and test against the installed # development version of joblib. - conda remove -y numpy - conda install -y -c conda-forge cython pillow scikit-learn + # TODO: unpin pip once either https://github.com/pypa/pip/issues/10825 + # accepts invalid HTML or Anaconda is fixed. 
+ conda install -y -c conda-forge cython pillow numpy scipy "pip<22" + pip install --pre --extra-index https://pypi.anaconda.org/scipy-wheels-nightly/simple scikit-learn python -c "import sklearn; print('Testing scikit-learn', sklearn.__version__)" # Move to a dedicated folder to avoid being polluted by joblib specific conftest.py # and disable the doctest plugin to avoid issues with doctests in scikit-learn # docstrings that require setting print_changed_only=True temporarily. - cd "/tmp" - pytest -vl --maxfail=5 -p no:doctest -k "not test_import_is_deprecated" --pyargs sklearn + NEW_TEST_DIR=$(mktemp -d) + cd $NEW_TEST_DIR + + pytest -vl --maxfail=5 -p no:doctest \ + -k "not test_import_is_deprecated" \ + -k "not test_check_memory" \ + --pyargs sklearn + + # Justification for skipping some tests: + # + # test_import_is_deprecated: Don't worry about deprecated imports: this is + # tested for real in upstream scikit-learn and this is not joblib's + # responsibility. Let's skip this test to avoid false positives in joblib's + # CI. + # + # test_check_memory: scikit-learn test need to be updated to avoid using + # cachedir: https://github.com/scikit-learn/scikit-learn/pull/22365 fi if [[ "$SKIP_TESTS" != "true" && "$COVERAGE" == "true" ]]; then echo "Deleting empty coverage files:" - find . -name ".coverage.*" -size 0 -print -delete + # the "|| echo" is to avoid having 0 return states that terminate the + # script when the find uncounters permission denied + find . -name ".coverage.*" -size 0 -print -delete || echo echo "Combining .coverage.* files..." coverage combine --append || echo "Found invalid coverage files." echo "Generating XML Coverage report..." diff --git a/doc/conf.py b/doc/conf.py index fa9e9289c..ae75b69d1 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -69,7 +69,7 @@ # General information about the project. project = 'joblib' -copyright = '2008-2018, Joblib developers' +copyright = '2008-2021, Joblib developers' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -253,7 +253,7 @@ shutil.copyfile('../README.rst', 'README.rst') except IOError: pass - # This fails during the tesing, as the code is ran in a different + # This fails during the testing, as the code is ran in a different # directory numpydoc_show_class_members = False diff --git a/doc/memory.rst b/doc/memory.rst index 0b9fbaef7..d539f5f61 100644 --- a/doc/memory.rst +++ b/doc/memory.rst @@ -145,7 +145,7 @@ arrays:: >>> cachedir2 = 'your_cachedir2_location' >>> memory2 = Memory(cachedir2, mmap_mode='r') >>> square = memory2.cache(np.square) - >>> a = np.vander(np.arange(3)).astype(np.float) + >>> a = np.vander(np.arange(3)).astype(float) >>> square(a) ________________________________________________________________________________ [Memory] Calling square... @@ -391,8 +391,8 @@ Gotchas ``joblib.Memory`` cache can get invalidated when upgrading ``joblib``. Invalidation can also happen when upgrading a third party library (such as ``numpy``): in such a case, only the cached function calls with parameters - that are constructs (or contain references to contructs) defined in the - upgraded library should potentially be invalidated after the uprade. + that are constructs (or contain references to constructs) defined in the + upgraded library should potentially be invalidated after the upgrade. 
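A minimal sketch of the cache-exploration workflow touched by these ``doc/memory.rst`` hunks, combining ``mmap_mode`` with the ``check_call_in_cache`` method added in the changelog above (the cache path below is only an illustrative assumption, not part of the patch)::

    import numpy as np
    from joblib import Memory

    # Illustrative cache location; any writable directory works.
    memory = Memory('/tmp/joblib_cache_demo', mmap_mode='r', verbose=0)
    square = memory.cache(np.square)

    a = np.vander(np.arange(3)).astype(float)
    square(a)                                # computes and stores the result

    # check_call_in_cache only inspects the store; it never triggers a call.
    assert square.check_call_in_cache(a)
    assert not square.check_call_in_cache(a + 1)

Because the check never executes the wrapped function, it stays cheap even when a cache miss would mean an expensive recomputation.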
Ignoring some arguments @@ -427,7 +427,7 @@ objects that, in addition of behaving like normal functions, expose methods useful for cache exploration and management. .. autoclass:: MemorizedFunc - :members: __init__, call, clear + :members: __init__, call, clear, check_call_in_cache .. diff --git a/doc/parallel.rst b/doc/parallel.rst index 466d613af..c4d3dd35a 100644 --- a/doc/parallel.rst +++ b/doc/parallel.rst @@ -69,7 +69,14 @@ In prior versions, the same effect could be achieved by hardcoding a specific backend implementation such as ``backend="threading"`` in the call to :class:`joblib.Parallel` but this is now considered a bad pattern (when done in a library) as it does not make it possible to override that -choice with the ``parallel_backend`` context manager. +choice with the :func:`~joblib.parallel_backend` context manager. + + +.. topic:: The loky backend may not always be available + + Some rare systems do not support multiprocessing (for instance + Pyodide). In this case the loky backend is not available and the + default backend falls back to threading. Besides builtin joblib backends, we can use `Joblib Apache Spark Backend `_ @@ -192,7 +199,7 @@ libraries: Since joblib 0.14, it is also possible to programmatically override the default number of threads using the ``inner_max_num_threads`` argument of the -``parallel_backend`` function as follows: +:func:`~joblib.parallel_backend` function as follows: .. code-block:: python diff --git a/examples/compressors_comparison.py b/examples/compressors_comparison.py index 64ebcf58d..3b20b10e5 100644 --- a/examples/compressors_comparison.py +++ b/examples/compressors_comparison.py @@ -8,7 +8,7 @@ GZip compression methods. For each compared compression method, this example dumps and reloads a dataset fetched from an online machine-learning database. This gives 3 -informations: the size on disk of the compressed data, the time spent to dump +information: the size on disk of the compressed data, the time spent to dump and the time spent to reload the data from disk. """ diff --git a/examples/serialization_and_wrappers.py b/examples/serialization_and_wrappers.py index 8e7ccf9eb..d03f0123e 100644 --- a/examples/serialization_and_wrappers.py +++ b/examples/serialization_and_wrappers.py @@ -36,7 +36,7 @@ def func_async(i, *args): ############################################################################### -# For most use-cases, using ``cloudpickle``` is efficient enough. However, this +# For most use-cases, using ``cloudpickle`` is efficient enough. However, this # solution can be very slow to serialize large python objects, such as dict or # list, compared to the standard ``pickle`` serialization. # @@ -78,7 +78,7 @@ def func_async(i, *args): # POSIX specification and can have bad interaction with compiled extensions # that use ``openmp``. Also, it is not possible to start processes with # ``fork`` on windows where only ``spawn`` is available. The ``loky`` backend -# has been developped to mitigate these issues. +# has been developed to mitigate these issues. 
# # To have fast pickling with ``loky``, it is possible to rely on ``pickle`` to # serialize all communications between the main process and the workers with diff --git a/joblib/__init__.py b/joblib/__init__.py index 9594f4c52..cb124c452 100644 --- a/joblib/__init__.py +++ b/joblib/__init__.py @@ -59,7 +59,7 @@ >>> cachedir = 'your_cache_dir_goes_here' >>> mem = Memory(cachedir) >>> import numpy as np - >>> a = np.vander(np.arange(3)).astype(np.float) + >>> a = np.vander(np.arange(3)).astype(float) >>> square = mem.cache(np.square) >>> b = square(a) # doctest: +ELLIPSIS ________________________________________________________________________________ @@ -106,7 +106,7 @@ # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. # 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = '1.0.1' +__version__ = '1.2.0' import os @@ -123,8 +123,7 @@ from .parallel import register_parallel_backend from .parallel import parallel_backend from .parallel import effective_n_jobs - -from .externals.loky import wrap_non_picklable_objects +from ._cloudpickle_wrapper import wrap_non_picklable_objects __all__ = ['Memory', 'MemorizedResult', 'PrintTime', 'Logger', 'hash', 'dump', diff --git a/joblib/_cloudpickle_wrapper.py b/joblib/_cloudpickle_wrapper.py new file mode 100644 index 000000000..3dbe3ae71 --- /dev/null +++ b/joblib/_cloudpickle_wrapper.py @@ -0,0 +1,17 @@ +""" +Small shim of loky's cloudpickle_wrapper to avoid failure when +multiprocessing is not available. +""" + + +from ._multiprocessing_helpers import mp + + +def my_wrap_non_picklable_objects(obj, keep_wrapper=True): + return obj + + +if mp is None: + wrap_non_picklable_objects = my_wrap_non_picklable_objects +else: + from .externals.loky import wrap_non_picklable_objects # noqa diff --git a/joblib/_dask.py b/joblib/_dask.py index 009ddc6b1..57e247e91 100644 --- a/joblib/_dask.py +++ b/joblib/_dask.py @@ -12,11 +12,13 @@ from .parallel import parallel_backend try: + import dask import distributed except ImportError: + dask = None distributed = None -if distributed is not None: +if dask is not None and distributed is not None: from dask.utils import funcname, itemgetter from dask.sizeof import sizeof from dask.distributed import ( @@ -24,10 +26,12 @@ as_completed, get_client, secede, - rejoin + rejoin, + get_worker ) from distributed.utils import thread_state + try: # asyncio.TimeoutError, Python3-only error thrown by recent versions of # distributed @@ -51,7 +55,7 @@ class _WeakKeyDictionary: such as large numpy arrays or pandas dataframes that are not hashable and therefore cannot be used as keys of traditional python dicts. - Futhermore using a dict with id(array) as key is not safe because the + Furthermore using a dict with id(array) as key is not safe because the Python is likely to reuse id of recently collected arrays. """ diff --git a/joblib/_memmapping_reducer.py b/joblib/_memmapping_reducer.py index d58382222..9d350c032 100644 --- a/joblib/_memmapping_reducer.py +++ b/joblib/_memmapping_reducer.py @@ -99,6 +99,10 @@ def unlink_file(filename): raise else: time.sleep(.2) + except FileNotFoundError: + # In case of a race condition when deleting the temporary folder, + # avoid noisy FileNotFoundError exception in the resource tracker. 
+ pass resource_tracker._CLEANUP_FUNCS['file'] = unlink_file diff --git a/joblib/_multiprocessing_helpers.py b/joblib/_multiprocessing_helpers.py index 1c5de2f8b..bde4bc190 100644 --- a/joblib/_multiprocessing_helpers.py +++ b/joblib/_multiprocessing_helpers.py @@ -14,6 +14,7 @@ if mp: try: import multiprocessing as mp + import _multiprocessing # noqa except ImportError: mp = None diff --git a/joblib/_parallel_backends.py b/joblib/_parallel_backends.py index 42645285d..c6ec537e9 100644 --- a/joblib/_parallel_backends.py +++ b/joblib/_parallel_backends.py @@ -431,10 +431,22 @@ def effective_n_jobs(self, n_jobs): if mp.current_process().daemon: # Daemonic processes cannot have children if n_jobs != 1: - warnings.warn( - 'Multiprocessing-backed parallel loops cannot be nested,' - ' setting n_jobs=1', - stacklevel=3) + if inside_dask_worker(): + msg = ( + "Inside a Dask worker with daemon=True, " + "setting n_jobs=1.\nPossible work-arounds:\n" + "- dask.config.set(" + "{'distributed.worker.daemon': False})" + "- set the environment variable " + "DASK_DISTRIBUTED__WORKER__DAEMON=False\n" + "before creating your Dask cluster." + ) + else: + msg = ( + 'Multiprocessing-backed parallel loops ' + 'cannot be nested, setting n_jobs=1' + ) + warnings.warn(msg, stacklevel=3) return 1 if process_executor._CURRENT_DEPTH > 0: @@ -509,10 +521,23 @@ def effective_n_jobs(self, n_jobs): elif mp.current_process().daemon: # Daemonic processes cannot have children if n_jobs != 1: - warnings.warn( - 'Loky-backed parallel loops cannot be called in a' - ' multiprocessing, setting n_jobs=1', - stacklevel=3) + if inside_dask_worker(): + msg = ( + "Inside a Dask worker with daemon=True, " + "setting n_jobs=1.\nPossible work-arounds:\n" + "- dask.config.set(" + "{'distributed.worker.daemon': False})\n" + "- set the environment variable " + "DASK_DISTRIBUTED__WORKER__DAEMON=False\n" + "before creating your Dask cluster." + ) + else: + msg = ( + 'Loky-backed parallel loops cannot be called in a' + ' multiprocessing, setting n_jobs=1' + ) + warnings.warn(msg, stacklevel=3) + return 1 elif not (self.in_main_thread() or self.nesting_level == 0): # Prevent posix fork inside in non-main posix threads @@ -608,3 +633,21 @@ class FallbackToBackend(Exception): def __init__(self, backend): self.backend = backend + + +def inside_dask_worker(): + """Check whether the current function is executed inside a Dask worker. + """ + # This function can not be in joblib._dask because there would be a + # circular import: + # _dask imports _parallel_backend that imports _dask ... + try: + from distributed import get_worker + except ImportError: + return False + + try: + get_worker() + return True + except ValueError: + return False diff --git a/joblib/_store_backends.py b/joblib/_store_backends.py index d4389ed86..e96f30610 100644 --- a/joblib/_store_backends.py +++ b/joblib/_store_backends.py @@ -130,7 +130,7 @@ def configure(self, location, verbose=0, backend_options=dict()): verbose: int The level of verbosity of the store backend_options: dict - Contains a dictionnary of named paremeters used to configure the + Contains a dictionary of named parameters used to configure the store backend. 
""" diff --git a/joblib/_utils.py b/joblib/_utils.py new file mode 100644 index 000000000..2dbd4f636 --- /dev/null +++ b/joblib/_utils.py @@ -0,0 +1,44 @@ +# Adapted from https://stackoverflow.com/a/9558001/2536294 + +import ast +import operator as op + +# supported operators +operators = { + ast.Add: op.add, + ast.Sub: op.sub, + ast.Mult: op.mul, + ast.Div: op.truediv, + ast.FloorDiv: op.floordiv, + ast.Mod: op.mod, + ast.Pow: op.pow, + ast.USub: op.neg, +} + + +def eval_expr(expr): + """ + >>> eval_expr('2*6') + 12 + >>> eval_expr('2**6') + 64 + >>> eval_expr('1 + 2*3**(4) / (6 + -7)') + -161.0 + """ + try: + return eval_(ast.parse(expr, mode="eval").body) + except (TypeError, SyntaxError, KeyError) as e: + raise ValueError( + f"{expr!r} is not a valid or supported arithmetic expression." + ) from e + + +def eval_(node): + if isinstance(node, ast.Num): # + return node.n + elif isinstance(node, ast.BinOp): # + return operators[type(node.op)](eval_(node.left), eval_(node.right)) + elif isinstance(node, ast.UnaryOp): # e.g., -1 + return operators[type(node.op)](eval_(node.operand)) + else: + raise TypeError(node) diff --git a/joblib/backports.py b/joblib/backports.py index cb2f7233d..c9936faae 100644 --- a/joblib/backports.py +++ b/joblib/backports.py @@ -2,12 +2,124 @@ Backports of fixes for joblib dependencies """ import os +import re import time -from distutils.version import LooseVersion from os.path import basename from multiprocessing import util +# Prior to joblib 1.2, joblib used to import LooseVersion from +# distutils.version. This import had a side-effect with setuptools that was +# implicitly required in sklearn.show_versions() to work without raising an +# exception for scikit-learn 1.0 and earlier. This has been fixed in +# scikit-learn 1.1 (not yet released at the time of writing), see: +# https://github.com/scikit-learn/scikit-learn/issues/22614 +# +# To avoid unnecessary disruption for users who might update to joblib 1.2 +# prior to a release of scikit-learn that includes the fix, let's keep on +# importing distutils here. TODO: Remove this for a future release of joblib, +# e.g. 6 months after the release of scikit-learn 1.1. +import distutils # noqa + + +class Version: + """Backport from deprecated distutils + + We maintain this backport to avoid introducing a new dependency on + `packaging`. + + We might rexplore this choice in the future if all major Python projects + introduce a dependency on packaging anyway. + """ + + def __init__(self, vstring=None): + if vstring: + self.parse(vstring) + + def __repr__(self): + return "%s ('%s')" % (self.__class__.__name__, str(self)) + + def __eq__(self, other): + c = self._cmp(other) + if c is NotImplemented: + return c + return c == 0 + + def __lt__(self, other): + c = self._cmp(other) + if c is NotImplemented: + return c + return c < 0 + + def __le__(self, other): + c = self._cmp(other) + if c is NotImplemented: + return c + return c <= 0 + + def __gt__(self, other): + c = self._cmp(other) + if c is NotImplemented: + return c + return c > 0 + + def __ge__(self, other): + c = self._cmp(other) + if c is NotImplemented: + return c + return c >= 0 + + +class LooseVersion(Version): + """Backport from deprecated distutils + + We maintain this backport to avoid introducing a new dependency on + `packaging`. + + We might rexplore this choice in the future if all major Python projects + introduce a dependency on packaging anyway. 
+ """ + + component_re = re.compile(r'(\d+ | [a-z]+ | \.)', re.VERBOSE) + + def __init__(self, vstring=None): + if vstring: + self.parse(vstring) + + def parse(self, vstring): + # I've given up on thinking I can reconstruct the version string + # from the parsed tuple -- so I just store the string here for + # use by __str__ + self.vstring = vstring + components = [x for x in self.component_re.split(vstring) + if x and x != '.'] + for i, obj in enumerate(components): + try: + components[i] = int(obj) + except ValueError: + pass + + self.version = components + + def __str__(self): + return self.vstring + + def __repr__(self): + return "LooseVersion ('%s')" % str(self) + + def _cmp(self, other): + if isinstance(other, str): + other = LooseVersion(other) + elif not isinstance(other, LooseVersion): + return NotImplemented + + if self.version == other.version: + return 0 + if self.version < other.version: + return -1 + if self.version > other.version: + return 1 + try: import numpy as np diff --git a/joblib/compressor.py b/joblib/compressor.py index 0dbd3dc93..8361d37d4 100644 --- a/joblib/compressor.py +++ b/joblib/compressor.py @@ -2,7 +2,7 @@ import io import zlib -from distutils.version import LooseVersion +from joblib.backports import LooseVersion try: from threading import RLock @@ -89,7 +89,7 @@ class CompressorWrapper(): prefix: bytestring A bytestring corresponding to the magic number that identifies the file format associated to the compressor. - extention: str + extension: str The file extension used to automatically select this compressor during a dump to a file. """ diff --git a/joblib/disk.py b/joblib/disk.py index 3b2735d04..32fbb89f6 100644 --- a/joblib/disk.py +++ b/joblib/disk.py @@ -66,10 +66,10 @@ def mkdirp(d): # if a rmtree operation fails in rm_subdirs, wait for this much time (in secs), # then retry up to RM_SUBDIRS_N_RETRY times. If it still fails, raise the -# exception. this mecanism ensures that the sub-process gc have the time to +# exception. this mechanism ensures that the sub-process gc have the time to # collect and close the memmaps before we fail. RM_SUBDIRS_RETRY_TIME = 0.1 -RM_SUBDIRS_N_RETRY = 5 +RM_SUBDIRS_N_RETRY = 10 def rm_subdirs(path, onerror=None): @@ -119,7 +119,7 @@ def delete_folder(folder_path, onerror=None, allow_non_empty=True): folder_path, ignore_errors=False, onerror=None ) util.debug( - "Sucessfully deleted {}".format(folder_path)) + "Successfully deleted {}".format(folder_path)) break else: raise OSError( diff --git a/joblib/externals/cloudpickle/__init__.py b/joblib/externals/cloudpickle/__init__.py index f461d65e9..c802221ef 100644 --- a/joblib/externals/cloudpickle/__init__.py +++ b/joblib/externals/cloudpickle/__init__.py @@ -1,6 +1,3 @@ -from __future__ import absolute_import - - from .cloudpickle import * # noqa from .cloudpickle_fast import CloudPickler, dumps, dump # noqa @@ -8,4 +5,4 @@ # expose their Pickler subclass at top-level under the "Pickler" name. Pickler = CloudPickler -__version__ = '1.6.0' +__version__ = '2.2.0' diff --git a/joblib/externals/cloudpickle/cloudpickle.py b/joblib/externals/cloudpickle/cloudpickle.py index 05d52afa0..317be6915 100644 --- a/joblib/externals/cloudpickle/cloudpickle.py +++ b/joblib/externals/cloudpickle/cloudpickle.py @@ -40,7 +40,6 @@ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
""" -from __future__ import print_function import builtins import dis @@ -55,7 +54,8 @@ import warnings from .compat import pickle -from typing import Generic, Union, Tuple, Callable +from collections import OrderedDict +from typing import ClassVar, Generic, Union, Tuple, Callable from pickle import _getattribute from importlib._bootstrap import _find_spec @@ -65,11 +65,6 @@ except ImportError: _typing_extensions = Literal = Final = None -if sys.version_info >= (3, 5, 3): - from typing import ClassVar -else: # pragma: no cover - ClassVar = None - if sys.version_info >= (3, 8): from types import CellType else: @@ -87,8 +82,11 @@ def g(): # communication speed over compatibility: DEFAULT_PROTOCOL = pickle.HIGHEST_PROTOCOL +# Names of modules whose resources should be treated as dynamic. +_PICKLE_BY_VALUE_MODULES = set() + # Track the provenance of reconstructed dynamic classes to make it possible to -# recontruct instances from the matching singleton class definition when +# reconstruct instances from the matching singleton class definition when # appropriate and preserve the usual "isinstance" semantics of Python objects. _DYNAMIC_CLASS_TRACKER_BY_CLASS = weakref.WeakKeyDictionary() _DYNAMIC_CLASS_TRACKER_BY_ID = weakref.WeakValueDictionary() @@ -123,6 +121,77 @@ def _lookup_class_or_track(class_tracker_id, class_def): return class_def +def register_pickle_by_value(module): + """Register a module to make it functions and classes picklable by value. + + By default, functions and classes that are attributes of an importable + module are to be pickled by reference, that is relying on re-importing + the attribute from the module at load time. + + If `register_pickle_by_value(module)` is called, all its functions and + classes are subsequently to be pickled by value, meaning that they can + be loaded in Python processes where the module is not importable. + + This is especially useful when developing a module in a distributed + execution environment: restarting the client Python process with the new + source code is enough: there is no need to re-install the new version + of the module on all the worker nodes nor to restart the workers. + + Note: this feature is considered experimental. See the cloudpickle + README.md file for more details and limitations. + """ + if not isinstance(module, types.ModuleType): + raise ValueError( + f"Input should be a module object, got {str(module)} instead" + ) + # In the future, cloudpickle may need a way to access any module registered + # for pickling by value in order to introspect relative imports inside + # functions pickled by value. (see + # https://github.com/cloudpipe/cloudpickle/pull/417#issuecomment-873684633). + # This access can be ensured by checking that module is present in + # sys.modules at registering time and assuming that it will still be in + # there when accessed during pickling. Another alternative would be to + # store a weakref to the module. Even though cloudpickle does not implement + # this introspection yet, in order to avoid a possible breaking change + # later, we still enforce the presence of module inside sys.modules. + if module.__name__ not in sys.modules: + raise ValueError( + f"{module} was not imported correctly, have you used an " + f"`import` statement to access it?" 
+ ) + _PICKLE_BY_VALUE_MODULES.add(module.__name__) + + +def unregister_pickle_by_value(module): + """Unregister that the input module should be pickled by value.""" + if not isinstance(module, types.ModuleType): + raise ValueError( + f"Input should be a module object, got {str(module)} instead" + ) + if module.__name__ not in _PICKLE_BY_VALUE_MODULES: + raise ValueError(f"{module} is not registered for pickle by value") + else: + _PICKLE_BY_VALUE_MODULES.remove(module.__name__) + + +def list_registry_pickle_by_value(): + return _PICKLE_BY_VALUE_MODULES.copy() + + +def _is_registered_pickle_by_value(module): + module_name = module.__name__ + if module_name in _PICKLE_BY_VALUE_MODULES: + return True + while True: + parent_name = module_name.rsplit(".", 1)[0] + if parent_name == module_name: + break + if parent_name in _PICKLE_BY_VALUE_MODULES: + return True + module_name = parent_name + return False + + def _whichmodule(obj, name): """Find the module an object belongs to. @@ -136,11 +205,14 @@ def _whichmodule(obj, name): # Workaround bug in old Python versions: prior to Python 3.7, # T.__module__ would always be set to "typing" even when the TypeVar T # would be defined in a different module. - # - # For such older Python versions, we ignore the __module__ attribute of - # TypeVar instances and instead exhaustively lookup those instances in - # all currently imported modules. - module_name = None + if name is not None and getattr(typing, name, None) is obj: + # Built-in TypeVar defined in typing such as AnyStr + return 'typing' + else: + # User defined or third-party TypeVar: __module__ attribute is + # irrelevant, thus trigger a exhaustive search for obj in all + # modules. + module_name = None else: module_name = getattr(obj, '__module__', None) @@ -166,18 +238,35 @@ def _whichmodule(obj, name): return None -def _is_importable(obj, name=None): - """Dispatcher utility to test the importability of various constructs.""" - if isinstance(obj, types.FunctionType): - return _lookup_module_and_qualname(obj, name=name) is not None - elif issubclass(type(obj), type): - return _lookup_module_and_qualname(obj, name=name) is not None +def _should_pickle_by_reference(obj, name=None): + """Test whether an function or a class should be pickled by reference + + Pickling by reference means by that the object (typically a function or a + class) is an attribute of a module that is assumed to be importable in the + target Python environment. Loading will therefore rely on importing the + module and then calling `getattr` on it to access the function or class. + + Pickling by reference is the only option to pickle functions and classes + in the standard library. In cloudpickle the alternative option is to + pickle by value (for instance for interactively or locally defined + functions and classes or for attributes of modules that have been + explicitly registered to be pickled by value. + """ + if isinstance(obj, types.FunctionType) or issubclass(type(obj), type): + module_and_name = _lookup_module_and_qualname(obj, name=name) + if module_and_name is None: + return False + module, name = module_and_name + return not _is_registered_pickle_by_value(module) + elif isinstance(obj, types.ModuleType): # We assume that sys.modules is primarily used as a cache mechanism for # the Python import machinery. Checking if a module has been added in - # is sys.modules therefore a cheap and simple heuristic to tell us whether - # we can assume that a given module could be imported by name in - # another Python process. 
+ # is sys.modules therefore a cheap and simple heuristic to tell us + # whether we can assume that a given module could be imported by name + # in another Python process. + if _is_registered_pickle_by_value(obj): + return False return obj.__name__ in sys.modules else: raise TypeError( @@ -232,11 +321,13 @@ def _extract_code_globals(co): """ out_names = _extract_code_globals_cache.get(co) if out_names is None: - names = co.co_names - out_names = {names[oparg] for _, oparg in _walk_global_ops(co)} + # We use a dict with None values instead of a set to get a + # deterministic order (assuming Python 3.6+) and avoid introducing + # non-deterministic pickle bytes as a results. + out_names = {name: None for name in _walk_global_ops(co)} # Declaring a function inside another one using the "def ..." - # syntax generates a constant code object corresonding to the one + # syntax generates a constant code object corresponding to the one # of the nested function's As the nested function may itself need # global variables, we need to introspect its code, extract its # globals, (look for code object in it's co_consts attribute..) and @@ -244,7 +335,7 @@ def _extract_code_globals(co): if co.co_consts: for const in co.co_consts: if isinstance(const, types.CodeType): - out_names |= _extract_code_globals(const) + out_names.update(_extract_code_globals(const)) _extract_code_globals_cache[co] = out_names @@ -419,13 +510,12 @@ def _builtin_type(name): def _walk_global_ops(code): """ - Yield (opcode, argument number) tuples for all - global-referencing instructions in *code*. + Yield referenced name for all global-referencing instructions in *code*. """ for instr in dis.get_instructions(code): op = instr.opcode if op in GLOBAL_OPS: - yield op, instr.arg + yield instr.argval def _extract_class_dict(cls): @@ -452,15 +542,31 @@ def _extract_class_dict(cls): if sys.version_info[:2] < (3, 7): # pragma: no branch def _is_parametrized_type_hint(obj): - # This is very cheap but might generate false positives. + # This is very cheap but might generate false positives. So try to + # narrow it down is good as possible. 
+ type_module = getattr(type(obj), '__module__', None) + from_typing_extensions = type_module == 'typing_extensions' + from_typing = type_module == 'typing' + # general typing Constructs is_typing = getattr(obj, '__origin__', None) is not None # typing_extensions.Literal - is_litteral = getattr(obj, '__values__', None) is not None + is_literal = ( + (getattr(obj, '__values__', None) is not None) + and from_typing_extensions + ) # typing_extensions.Final - is_final = getattr(obj, '__type__', None) is not None + is_final = ( + (getattr(obj, '__type__', None) is not None) + and from_typing_extensions + ) + + # typing.ClassVar + is_classvar = ( + (getattr(obj, '__type__', None) is not None) and from_typing + ) # typing.Union/Tuple for old Python 3.5 is_union = getattr(obj, '__union_params__', None) is not None @@ -469,8 +575,8 @@ def _is_parametrized_type_hint(obj): getattr(obj, '__result__', None) is not None and getattr(obj, '__args__', None) is not None ) - return any((is_typing, is_litteral, is_final, is_union, is_tuple, - is_callable)) + return any((is_typing, is_literal, is_final, is_classvar, is_union, + is_tuple, is_callable)) def _create_parametrized_type_hint(origin, args): return origin[args] @@ -490,43 +596,21 @@ def parametrized_type_hint_getinitargs(obj): elif type(obj) is type(ClassVar): initargs = (ClassVar, obj.__type__) elif type(obj) is type(Generic): - parameters = obj.__parameters__ - if len(obj.__parameters__) > 0: - # in early Python 3.5, __parameters__ was sometimes - # preferred to __args__ - initargs = (obj.__origin__, parameters) - - else: - initargs = (obj.__origin__, obj.__args__) + initargs = (obj.__origin__, obj.__args__) elif type(obj) is type(Union): - if sys.version_info < (3, 5, 3): # pragma: no cover - initargs = (Union, obj.__union_params__) - else: - initargs = (Union, obj.__args__) + initargs = (Union, obj.__args__) elif type(obj) is type(Tuple): - if sys.version_info < (3, 5, 3): # pragma: no cover - initargs = (Tuple, obj.__tuple_params__) - else: - initargs = (Tuple, obj.__args__) + initargs = (Tuple, obj.__args__) elif type(obj) is type(Callable): - if sys.version_info < (3, 5, 3): # pragma: no cover - args = obj.__args__ - result = obj.__result__ - if args != Ellipsis: - if isinstance(args, tuple): - args = list(args) - else: - args = [args] + (*args, result) = obj.__args__ + if len(args) == 1 and args[0] is Ellipsis: + args = Ellipsis else: - (*args, result) = obj.__args__ - if len(args) == 1 and args[0] is Ellipsis: - args = Ellipsis - else: - args = list(args) + args = list(args) initargs = (Callable, (args, result)) else: # pragma: no cover raise pickle.PicklingError( - "Cloudpickle Error: Unknown type {}".format(type(obj)) + f"Cloudpickle Error: Unknown type {type(obj)}" ) return initargs @@ -557,8 +641,11 @@ def _rebuild_tornado_coroutine(func): loads = pickle.loads -# hack for __import__ not working as desired def subimport(name): + # We cannot do simply: `return __import__(name)`: Indeed, if ``name`` is + # the name of a submodule, __import__ will return the top-level root module + # of this submodule. For instance, __import__('os.path') returns the `os` + # module. 
__import__(name) return sys.modules[name] @@ -603,7 +690,7 @@ def instance(cls): @instance -class _empty_cell_value(object): +class _empty_cell_value: """sentinel for empty closures """ @classmethod @@ -632,7 +719,7 @@ def _fill_function(*args): keys = ['globals', 'defaults', 'dict', 'module', 'closure_values'] state = dict(zip(keys, args[1:])) else: - raise ValueError('Unexpected _fill_value arguments: %r' % (args,)) + raise ValueError(f'Unexpected _fill_value arguments: {args!r}') # - At pickling time, any dynamic global variable used by func is # serialized by value (in state['globals']). @@ -676,6 +763,12 @@ def _fill_function(*args): return func +def _make_function(code, globals, name, argdefs, closure): + # Setting __builtins__ in globals is needed for nogil CPython. + globals["__builtins__"] = __builtins__ + return types.FunctionType(code, globals, name, argdefs, closure) + + def _make_empty_cell(): if False: # trick the compiler into creating an empty cell in our lambda @@ -699,7 +792,7 @@ def _make_skel_func(code, cell_count, base_globals=None): """ # This function is deprecated and should be removed in cloudpickle 1.7 warnings.warn( - "A pickle file created using an old (<=1.4.1) version of cloudpicke " + "A pickle file created using an old (<=1.4.1) version of cloudpickle " "is currently being loaded. This is not supported by cloudpickle and " "will break in cloudpickle 1.7", category=UserWarning ) @@ -800,29 +893,33 @@ def _make_typevar(name, bound, constraints, covariant, contravariant, def _decompose_typevar(obj): - try: - class_tracker_id = _get_or_create_tracker_id(obj) - except TypeError: # pragma: nocover - # TypeVar instances are not weakref-able in Python 3.5.3 - class_tracker_id = None return ( obj.__name__, obj.__bound__, obj.__constraints__, obj.__covariant__, obj.__contravariant__, - class_tracker_id, + _get_or_create_tracker_id(obj), ) def _typevar_reduce(obj): - # TypeVar instances have no __qualname__ hence we pass the name explicitly. + # TypeVar instances require the module information hence why we + # are not using the _should_pickle_by_reference directly module_and_name = _lookup_module_and_qualname(obj, name=obj.__name__) + if module_and_name is None: return (_make_typevar, _decompose_typevar(obj)) + elif _is_registered_pickle_by_value(module_and_name[0]): + return (_make_typevar, _decompose_typevar(obj)) + return (getattr, module_and_name) def _get_bases(typ): - if hasattr(typ, '__orig_bases__'): + if '__orig_bases__' in getattr(typ, '__dict__', {}): # For generic types (see PEP 560) + # Note that simply checking `hasattr(typ, '__orig_bases__')` is not + # correct. Subclasses of a fully-parameterized generic class does not + # have `__orig_bases__` defined, but `hasattr(typ, '__orig_bases__')` + # will return True because it's defined in the base class. 
bases_attr = '__orig_bases__' else: # For regular class objects @@ -830,13 +927,22 @@ def _get_bases(typ): return getattr(typ, bases_attr) -def _make_dict_keys(obj): - return dict.fromkeys(obj).keys() +def _make_dict_keys(obj, is_ordered=False): + if is_ordered: + return OrderedDict.fromkeys(obj).keys() + else: + return dict.fromkeys(obj).keys() -def _make_dict_values(obj): - return {i: _ for i, _ in enumerate(obj)}.values() +def _make_dict_values(obj, is_ordered=False): + if is_ordered: + return OrderedDict((i, _) for i, _ in enumerate(obj)).values() + else: + return {i: _ for i, _ in enumerate(obj)}.values() -def _make_dict_items(obj): - return obj.items() +def _make_dict_items(obj, is_ordered=False): + if is_ordered: + return OrderedDict(obj).items() + else: + return obj.items() diff --git a/joblib/externals/cloudpickle/cloudpickle_fast.py b/joblib/externals/cloudpickle/cloudpickle_fast.py index fa8da0f63..8741dcbda 100644 --- a/joblib/externals/cloudpickle/cloudpickle_fast.py +++ b/joblib/externals/cloudpickle/cloudpickle_fast.py @@ -6,7 +6,7 @@ is only available for Python versions 3.8+, a lot of backward-compatibility code is also removed. -Note that the C Pickler sublassing API is CPython-specific. Therefore, some +Note that the C Pickler subclassing API is CPython-specific. Therefore, some guards present in cloudpickle.py that were written to handle PyPy specificities are not present in cloudpickle_fast.py """ @@ -23,23 +23,23 @@ import typing from enum import Enum -from collections import ChainMap +from collections import ChainMap, OrderedDict from .compat import pickle, Pickler from .cloudpickle import ( _extract_code_globals, _BUILTIN_TYPE_NAMES, DEFAULT_PROTOCOL, - _find_imported_submodules, _get_cell_contents, _is_importable, + _find_imported_submodules, _get_cell_contents, _should_pickle_by_reference, _builtin_type, _get_or_create_tracker_id, _make_skeleton_class, _make_skeleton_enum, _extract_class_dict, dynamic_subimport, subimport, _typevar_reduce, _get_bases, _make_cell, _make_empty_cell, CellType, _is_parametrized_type_hint, PYPY, cell_set, parametrized_type_hint_getinitargs, _create_parametrized_type_hint, builtin_code_type, - _make_dict_keys, _make_dict_values, _make_dict_items, + _make_dict_keys, _make_dict_values, _make_dict_items, _make_function, ) -if pickle.HIGHEST_PROTOCOL >= 5 and not PYPY: +if pickle.HIGHEST_PROTOCOL >= 5: # Shorthands similar to pickle.dump/pickle.dumps def dump(obj, file, protocol=None, buffer_callback=None): @@ -123,7 +123,7 @@ def _class_getnewargs(obj): def _enum_getnewargs(obj): - members = dict((e.name, e.value) for e in obj) + members = {e.name: e.value for e in obj} return (obj.__bases__, obj.__name__, obj.__qualname__, members, obj.__module__, _get_or_create_tracker_id(obj), None) @@ -180,7 +180,7 @@ def _class_getstate(obj): clsdict.pop('__weakref__', None) if issubclass(type(obj), abc.ABCMeta): - # If obj is an instance of an ABCMeta subclass, dont pickle the + # If obj is an instance of an ABCMeta subclass, don't pickle the # cache/negative caches populated during isinstance/issubclass # checks, but pickle the list of registered subclasses of obj. clsdict.pop('_abc_cache', None) @@ -218,7 +218,7 @@ def _class_getstate(obj): def _enum_getstate(obj): clsdict, slotstate = _class_getstate(obj) - members = dict((e.name, e.value) for e in obj) + members = {e.name: e.value for e in obj} # Cleanup the clsdict that will be passed to _rehydrate_skeleton_class: # Those attributes are already handled by the metaclass. 
for attrname in ["_generate_next_value_", "_member_names_", @@ -244,7 +244,46 @@ def _enum_getstate(obj): def _code_reduce(obj): """codeobject reducer""" - if hasattr(obj, "co_posonlyargcount"): # pragma: no branch + # If you are not sure about the order of arguments, take a look at help + # of the specific type from types, for example: + # >>> from types import CodeType + # >>> help(CodeType) + if hasattr(obj, "co_exceptiontable"): # pragma: no branch + # Python 3.11 and later: there are some new attributes + # related to the enhanced exceptions. + args = ( + obj.co_argcount, obj.co_posonlyargcount, + obj.co_kwonlyargcount, obj.co_nlocals, obj.co_stacksize, + obj.co_flags, obj.co_code, obj.co_consts, obj.co_names, + obj.co_varnames, obj.co_filename, obj.co_name, obj.co_qualname, + obj.co_firstlineno, obj.co_linetable, obj.co_exceptiontable, + obj.co_freevars, obj.co_cellvars, + ) + elif hasattr(obj, "co_linetable"): # pragma: no branch + # Python 3.10 and later: obj.co_lnotab is deprecated and constructor + # expects obj.co_linetable instead. + args = ( + obj.co_argcount, obj.co_posonlyargcount, + obj.co_kwonlyargcount, obj.co_nlocals, obj.co_stacksize, + obj.co_flags, obj.co_code, obj.co_consts, obj.co_names, + obj.co_varnames, obj.co_filename, obj.co_name, + obj.co_firstlineno, obj.co_linetable, obj.co_freevars, + obj.co_cellvars + ) + elif hasattr(obj, "co_nmeta"): # pragma: no cover + # "nogil" Python: modified attributes from 3.9 + args = ( + obj.co_argcount, obj.co_posonlyargcount, + obj.co_kwonlyargcount, obj.co_nlocals, obj.co_framesize, + obj.co_ndefaultargs, obj.co_nmeta, + obj.co_flags, obj.co_code, obj.co_consts, + obj.co_varnames, obj.co_filename, obj.co_name, + obj.co_firstlineno, obj.co_lnotab, obj.co_exc_handlers, + obj.co_jump_table, obj.co_freevars, obj.co_cellvars, + obj.co_free2reg, obj.co_cell2reg + ) + elif hasattr(obj, "co_posonlyargcount"): + # Backward compat for 3.9 and older args = ( obj.co_argcount, obj.co_posonlyargcount, obj.co_kwonlyargcount, obj.co_nlocals, obj.co_stacksize, @@ -254,6 +293,7 @@ def _code_reduce(obj): obj.co_cellvars ) else: + # Backward compat for even older versions of Python args = ( obj.co_argcount, obj.co_kwonlyargcount, obj.co_nlocals, obj.co_stacksize, obj.co_flags, obj.co_code, obj.co_consts, @@ -339,11 +379,16 @@ def _memoryview_reduce(obj): def _module_reduce(obj): - if _is_importable(obj): + if _should_pickle_by_reference(obj): return subimport, (obj.__name__,) else: - obj.__dict__.pop('__builtins__', None) - return dynamic_subimport, (obj.__name__, vars(obj)) + # Some external libraries can populate the "__builtins__" entry of a + # module's `__dict__` with unpicklable objects (see #316). For that + # reason, we do not attempt to pickle the "__builtins__" entry, and + # restore a default value for it at unpickling time. 
+ state = obj.__dict__.copy() + state.pop('__builtins__', None) + return dynamic_subimport, (obj.__name__, state) def _method_reduce(obj): @@ -396,7 +441,7 @@ def _class_reduce(obj): return type, (NotImplemented,) elif obj in _BUILTIN_TYPE_NAMES: return _builtin_type, (_BUILTIN_TYPE_NAMES[obj],) - elif not _is_importable(obj): + elif not _should_pickle_by_reference(obj): return _dynamic_class_reduce(obj) return NotImplemented @@ -419,6 +464,24 @@ def _dict_items_reduce(obj): return _make_dict_items, (dict(obj), ) +def _odict_keys_reduce(obj): + # Safer not to ship the full dict as sending the rest might + # be unintended and could potentially cause leaking of + # sensitive information + return _make_dict_keys, (list(obj), True) + + +def _odict_values_reduce(obj): + # Safer not to ship the full dict as sending the rest might + # be unintended and could potentially cause leaking of + # sensitive information + return _make_dict_values, (list(obj), True) + + +def _odict_items_reduce(obj): + return _make_dict_items, (dict(obj), True) + + # COLLECTIONS OF OBJECTS STATE SETTERS # ------------------------------------ # state setters are called at unpickling time, once the object is created and @@ -426,7 +489,7 @@ def _dict_items_reduce(obj): def _function_setstate(obj, state): - """Update the state of a dynaamic function. + """Update the state of a dynamic function. As __closure__ and __globals__ are readonly attributes of a function, we cannot rely on the native setstate routine of pickle.load_build, that calls @@ -495,7 +558,13 @@ class CloudPickler(Pickler): _dispatch_table[_collections_abc.dict_keys] = _dict_keys_reduce _dispatch_table[_collections_abc.dict_values] = _dict_values_reduce _dispatch_table[_collections_abc.dict_items] = _dict_items_reduce - + _dispatch_table[type(OrderedDict().keys())] = _odict_keys_reduce + _dispatch_table[type(OrderedDict().values())] = _odict_values_reduce + _dispatch_table[type(OrderedDict().items())] = _odict_items_reduce + _dispatch_table[abc.abstractmethod] = _classmethod_reduce + _dispatch_table[abc.abstractclassmethod] = _classmethod_reduce + _dispatch_table[abc.abstractstaticmethod] = _classmethod_reduce + _dispatch_table[abc.abstractproperty] = _property_reduce dispatch_table = ChainMap(_dispatch_table, copyreg.dispatch_table) @@ -505,7 +574,7 @@ def _dynamic_function_reduce(self, func): """Reduce a function that is not pickleable via attribute lookup.""" newargs = self._function_getnewargs(func) state = _function_getstate(func) - return (types.FunctionType, newargs, state, None, None, + return (_make_function, newargs, state, None, None, _function_setstate) def _function_reduce(self, obj): @@ -520,7 +589,7 @@ def _function_reduce(self, obj): As opposed to cloudpickle.py, There no special handling for builtin pypy functions because cloudpickle_fast is CPython-specific. """ - if _is_importable(obj): + if _should_pickle_by_reference(obj): return NotImplemented else: return self._dynamic_function_reduce(obj) @@ -572,6 +641,32 @@ def dump(self, obj): raise if pickle.HIGHEST_PROTOCOL >= 5: + def __init__(self, file, protocol=None, buffer_callback=None): + if protocol is None: + protocol = DEFAULT_PROTOCOL + Pickler.__init__( + self, file, protocol=protocol, buffer_callback=buffer_callback + ) + # map functions __globals__ attribute ids, to ensure that functions + # sharing the same global namespace at pickling time also share + # their global namespace at unpickling time. 
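A small sketch of what the ``globals_ref`` bookkeeping described in the comment above buys, assuming the snippet is run as a script (so both functions live in ``__main__`` and are pickled by value) and the vendored copy is importable as ``joblib.externals.cloudpickle``.

    import pickle

    from joblib.externals import cloudpickle

    n = 0

    def bump():
        global n
        n += 1

    def read():
        return n

    # Pickling both functions in a single dumps() call lets the pickler map
    # their common __globals__ to one shared dict on the unpickling side too.
    bump2, read2 = pickle.loads(cloudpickle.dumps((bump, read)))
    bump2()
    print(read2())   # 1: the two rebuilt functions share one namespace
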
+ self.globals_ref = {} + self.proto = int(protocol) + else: + def __init__(self, file, protocol=None): + if protocol is None: + protocol = DEFAULT_PROTOCOL + Pickler.__init__(self, file, protocol=protocol) + # map functions __globals__ attribute ids, to ensure that functions + # sharing the same global namespace at pickling time also share + # their global namespace at unpickling time. + self.globals_ref = {} + assert hasattr(self, 'proto') + + if pickle.HIGHEST_PROTOCOL >= 5 and not PYPY: + # Pickler is the C implementation of the CPython pickler and therefore + # we rely on reduce_override method to customize the pickler behavior. + # `CloudPickler.dispatch` is only left for backward compatibility - note # that when using protocol 5, `CloudPickler.dispatch` is not an # extension of `Pickler.dispatch` dictionary, because CloudPickler @@ -579,7 +674,7 @@ def dump(self, obj): # `dispatch` attribute. Earlier versions of the protocol 5 CloudPickler # used `CloudPickler.dispatch` as a class-level attribute storing all # reducers implemented by cloudpickle, but the attribute name was not a - # great choice given the meaning of `Cloudpickler.dispatch` when + # great choice given the meaning of `CloudPickler.dispatch` when # `CloudPickler` extends the pure-python pickler. dispatch = dispatch_table @@ -592,17 +687,6 @@ def dump(self, obj): # availability of both notions coincide on CPython's pickle and the # pickle5 backport, but it may not be the case anymore when pypy # implements protocol 5 - def __init__(self, file, protocol=None, buffer_callback=None): - if protocol is None: - protocol = DEFAULT_PROTOCOL - Pickler.__init__( - self, file, protocol=protocol, buffer_callback=buffer_callback - ) - # map functions __globals__ attribute ids, to ensure that functions - # sharing the same global namespace at pickling time also share - # their global namespace at unpickling time. - self.globals_ref = {} - self.proto = int(protocol) def reducer_override(self, obj): """Type-agnostic reducing callback for function and classes. @@ -653,7 +737,7 @@ def reducer_override(self, obj): return self._function_reduce(obj) else: # fallback to save_global, including the Pickler's - # distpatch_table + # dispatch_table return NotImplemented else: @@ -663,16 +747,6 @@ def reducer_override(self, obj): # hard-coded call to save_global when pickling meta-classes. dispatch = Pickler.dispatch.copy() - def __init__(self, file, protocol=None): - if protocol is None: - protocol = DEFAULT_PROTOCOL - Pickler.__init__(self, file, protocol=protocol) - # map functions __globals__ attribute ids, to ensure that functions - # sharing the same global namespace at pickling time also share - # their global namespace at unpickling time. - self.globals_ref = {} - assert hasattr(self, 'proto') - def _save_reduce_pickle5(self, func, args, state=None, listitems=None, dictitems=None, state_setter=None, obj=None): save = self.save @@ -724,7 +798,7 @@ def save_global(self, obj, name=None, pack=struct.pack): ) elif name is not None: Pickler.save_global(self, obj, name=name) - elif not _is_importable(obj, name=name): + elif not _should_pickle_by_reference(obj, name=name): self._save_reduce_pickle5(*_dynamic_class_reduce(obj), obj=obj) else: Pickler.save_global(self, obj, name=name) @@ -736,7 +810,7 @@ def save_function(self, obj, name=None): Determines what kind of function obj is (e.g. lambda, defined at interactive prompt, etc) and handles the pickling appropriately. 
""" - if _is_importable(obj, name=name): + if _should_pickle_by_reference(obj, name=name): return Pickler.save_global(self, obj, name=name) elif PYPY and isinstance(obj.__code__, builtin_code_type): return self.save_pypy_builtin_func(obj) diff --git a/joblib/externals/cloudpickle/compat.py b/joblib/externals/cloudpickle/compat.py index afa285f62..5e9b52773 100644 --- a/joblib/externals/cloudpickle/compat.py +++ b/joblib/externals/cloudpickle/compat.py @@ -7,7 +7,12 @@ from pickle5 import Pickler # noqa: F401 except ImportError: import pickle # noqa: F401 + + # Use the Python pickler for old CPython versions from pickle import _Pickler as Pickler # noqa: F401 else: import pickle # noqa: F401 - from _pickle import Pickler # noqa: F401 + + # Pickler will the C implementation in CPython and the Python + # implementation in PyPy + from pickle import Pickler # noqa: F401 diff --git a/joblib/externals/loky/__init__.py b/joblib/externals/loky/__init__.py index 21f3bb6b9..fd2008d78 100644 --- a/joblib/externals/loky/__init__.py +++ b/joblib/externals/loky/__init__.py @@ -3,11 +3,18 @@ :class:`ProcessPoolExecutor` and a function :func:`get_reusable_executor` which hide the pool management under the hood. """ -from ._base import Executor, Future -from ._base import wait, as_completed -from ._base import TimeoutError, CancelledError -from ._base import ALL_COMPLETED, FIRST_COMPLETED, FIRST_EXCEPTION +from concurrent.futures import ( + ALL_COMPLETED, + FIRST_COMPLETED, + FIRST_EXCEPTION, + CancelledError, + Executor, + TimeoutError, + as_completed, + wait, +) +from ._base import Future from .backend.context import cpu_count from .backend.reduction import set_loky_pickler from .reusable_executor import get_reusable_executor @@ -22,4 +29,4 @@ "wrap_non_picklable_objects", "set_loky_pickler"] -__version__ = '2.9.0' +__version__ = '3.3.0' diff --git a/joblib/externals/loky/_base.py b/joblib/externals/loky/_base.py index 92422bbf3..cd8f34100 100644 --- a/joblib/externals/loky/_base.py +++ b/joblib/externals/loky/_base.py @@ -1,5 +1,5 @@ ############################################################################### -# Backport concurrent.futures for python2.7/3.3 +# Modification of concurrent.futures.Future # # author: Thomas Moreau and Olivier Grisel # @@ -10,618 +10,19 @@ # Copyright 2009 Brian Quinlan. All Rights Reserved. # Licensed to PSF under a Contributor Agreement. -import sys -import time -import logging -import threading -import collections - - -if sys.version_info[:2] >= (3, 3): - - from concurrent.futures import wait, as_completed - from concurrent.futures import TimeoutError, CancelledError - from concurrent.futures import Executor, Future as _BaseFuture - - from concurrent.futures import FIRST_EXCEPTION - from concurrent.futures import ALL_COMPLETED, FIRST_COMPLETED - - from concurrent.futures._base import LOGGER - from concurrent.futures._base import PENDING, RUNNING, CANCELLED - from concurrent.futures._base import CANCELLED_AND_NOTIFIED, FINISHED -else: - - FIRST_COMPLETED = 'FIRST_COMPLETED' - FIRST_EXCEPTION = 'FIRST_EXCEPTION' - ALL_COMPLETED = 'ALL_COMPLETED' - _AS_COMPLETED = '_AS_COMPLETED' - - # Possible future states (for internal use by the futures package). - PENDING = 'PENDING' - RUNNING = 'RUNNING' - # The future was cancelled by the user... - CANCELLED = 'CANCELLED' - # ...and _Waiter.add_cancelled() was called by a worker. 
- CANCELLED_AND_NOTIFIED = 'CANCELLED_AND_NOTIFIED' - FINISHED = 'FINISHED' - - _FUTURE_STATES = [ - PENDING, - RUNNING, - CANCELLED, - CANCELLED_AND_NOTIFIED, - FINISHED - ] - - _STATE_TO_DESCRIPTION_MAP = { - PENDING: "pending", - RUNNING: "running", - CANCELLED: "cancelled", - CANCELLED_AND_NOTIFIED: "cancelled", - FINISHED: "finished" - } - - # Logger for internal use by the futures package. - LOGGER = logging.getLogger("concurrent.futures") - - class Error(Exception): - """Base class for all future-related exceptions.""" - pass - - class CancelledError(Error): - """The Future was cancelled.""" - pass - - class TimeoutError(Error): - """The operation exceeded the given deadline.""" - pass - - class _Waiter(object): - """Provides the event that wait() and as_completed() block on.""" - def __init__(self): - self.event = threading.Event() - self.finished_futures = [] - - def add_result(self, future): - self.finished_futures.append(future) - - def add_exception(self, future): - self.finished_futures.append(future) - - def add_cancelled(self, future): - self.finished_futures.append(future) - - class _AsCompletedWaiter(_Waiter): - """Used by as_completed().""" - - def __init__(self): - super(_AsCompletedWaiter, self).__init__() - self.lock = threading.Lock() - - def add_result(self, future): - with self.lock: - super(_AsCompletedWaiter, self).add_result(future) - self.event.set() - - def add_exception(self, future): - with self.lock: - super(_AsCompletedWaiter, self).add_exception(future) - self.event.set() - - def add_cancelled(self, future): - with self.lock: - super(_AsCompletedWaiter, self).add_cancelled(future) - self.event.set() - - class _FirstCompletedWaiter(_Waiter): - """Used by wait(return_when=FIRST_COMPLETED).""" - - def add_result(self, future): - super(_FirstCompletedWaiter, self).add_result(future) - self.event.set() - - def add_exception(self, future): - super(_FirstCompletedWaiter, self).add_exception(future) - self.event.set() - - def add_cancelled(self, future): - super(_FirstCompletedWaiter, self).add_cancelled(future) - self.event.set() - - class _AllCompletedWaiter(_Waiter): - """Used by wait(return_when=FIRST_EXCEPTION and ALL_COMPLETED).""" - - def __init__(self, num_pending_calls, stop_on_exception): - self.num_pending_calls = num_pending_calls - self.stop_on_exception = stop_on_exception - self.lock = threading.Lock() - super(_AllCompletedWaiter, self).__init__() - - def _decrement_pending_calls(self): - with self.lock: - self.num_pending_calls -= 1 - if not self.num_pending_calls: - self.event.set() - - def add_result(self, future): - super(_AllCompletedWaiter, self).add_result(future) - self._decrement_pending_calls() - - def add_exception(self, future): - super(_AllCompletedWaiter, self).add_exception(future) - if self.stop_on_exception: - self.event.set() - else: - self._decrement_pending_calls() - - def add_cancelled(self, future): - super(_AllCompletedWaiter, self).add_cancelled(future) - self._decrement_pending_calls() - - class _AcquireFutures(object): - """A context manager that does an ordered acquire of Future conditions. 
- """ - - def __init__(self, futures): - self.futures = sorted(futures, key=id) - - def __enter__(self): - for future in self.futures: - future._condition.acquire() - - def __exit__(self, *args): - for future in self.futures: - future._condition.release() - - def _create_and_install_waiters(fs, return_when): - if return_when == _AS_COMPLETED: - waiter = _AsCompletedWaiter() - elif return_when == FIRST_COMPLETED: - waiter = _FirstCompletedWaiter() - else: - pending_count = sum( - f._state not in [CANCELLED_AND_NOTIFIED, FINISHED] - for f in fs) - - if return_when == FIRST_EXCEPTION: - waiter = _AllCompletedWaiter(pending_count, - stop_on_exception=True) - elif return_when == ALL_COMPLETED: - waiter = _AllCompletedWaiter(pending_count, - stop_on_exception=False) - else: - raise ValueError("Invalid return condition: %r" % return_when) - - for f in fs: - f._waiters.append(waiter) - - return waiter - - def as_completed(fs, timeout=None): - """An iterator over the given futures that yields each as it completes. - - Args: - fs: The sequence of Futures (possibly created by different - Executors) to iterate over. - timeout: The maximum number of seconds to wait. If None, then there - is no limit on the wait time. - - Returns: - An iterator that yields the given Futures as they complete - (finished or cancelled). If any given Futures are duplicated, they - will be returned once. - - Raises: - TimeoutError: If the entire result iterator could not be generated - before the given timeout. - """ - if timeout is not None: - end_time = timeout + time.time() - - fs = set(fs) - with _AcquireFutures(fs): - finished = set( - f for f in fs - if f._state in [CANCELLED_AND_NOTIFIED, FINISHED]) - pending = fs - finished - waiter = _create_and_install_waiters(fs, _AS_COMPLETED) - - try: - for future in finished: - yield future - - while pending: - if timeout is None: - wait_timeout = None - else: - wait_timeout = end_time - time.time() - if wait_timeout < 0: - raise TimeoutError('%d (of %d) futures unfinished' % ( - len(pending), len(fs))) - - waiter.event.wait(wait_timeout) - - with waiter.lock: - finished = waiter.finished_futures - waiter.finished_futures = [] - waiter.event.clear() - - for future in finished: - yield future - pending.remove(future) - - finally: - for f in fs: - with f._condition: - f._waiters.remove(waiter) - - DoneAndNotDoneFutures = collections.namedtuple( - 'DoneAndNotDoneFutures', 'done not_done') - - def wait(fs, timeout=None, return_when=ALL_COMPLETED): - """Wait for the futures in the given sequence to complete. - - Args: - fs: The sequence of Futures (possibly created by different - Executors) to wait upon. - timeout: The maximum number of seconds to wait. If None, then there - is no limit on the wait time. - return_when: Indicates when this function should return. The - options are: - - FIRST_COMPLETED - Return when any future finishes or is - cancelled. - FIRST_EXCEPTION - Return when any future finishes by raising an - exception. If no future raises an exception - then it is equivalent to ALL_COMPLETED. - ALL_COMPLETED - Return when all futures finish or are - cancelled. - - Returns: - A named 2-tuple of sets. The first set, named 'done', contains the - futures that completed (is finished or cancelled) before the wait - completed. The second set, named 'not_done', contains uncompleted - futures. 
- """ - with _AcquireFutures(fs): - done = set(f for f in fs - if f._state in [CANCELLED_AND_NOTIFIED, FINISHED]) - not_done = set(fs) - done - - if (return_when == FIRST_COMPLETED) and done: - return DoneAndNotDoneFutures(done, not_done) - elif (return_when == FIRST_EXCEPTION) and done: - if any(f for f in done - if not f.cancelled() and f.exception() is not None): - return DoneAndNotDoneFutures(done, not_done) - - if len(done) == len(fs): - return DoneAndNotDoneFutures(done, not_done) - - waiter = _create_and_install_waiters(fs, return_when) - - waiter.event.wait(timeout) - for f in fs: - with f._condition: - f._waiters.remove(waiter) - - done.update(waiter.finished_futures) - return DoneAndNotDoneFutures(done, set(fs) - done) - - class _BaseFuture(object): - """Represents the result of an asynchronous computation.""" - - def __init__(self): - """Initializes the future. Should not be called by clients.""" - self._condition = threading.Condition() - self._state = PENDING - self._result = None - self._exception = None - self._waiters = [] - self._done_callbacks = [] - - def __repr__(self): - with self._condition: - if self._state == FINISHED: - if self._exception: - return '<%s at %#x state=%s raised %s>' % ( - self.__class__.__name__, - id(self), - _STATE_TO_DESCRIPTION_MAP[self._state], - self._exception.__class__.__name__) - else: - return '<%s at %#x state=%s returned %s>' % ( - self.__class__.__name__, - id(self), - _STATE_TO_DESCRIPTION_MAP[self._state], - self._result.__class__.__name__) - return '<%s at %#x state=%s>' % ( - self.__class__.__name__, - id(self), - _STATE_TO_DESCRIPTION_MAP[self._state]) - - def cancel(self): - """Cancel the future if possible. - - Returns True if the future was cancelled, False otherwise. A future - cannot be cancelled if it is running or has already completed. - """ - with self._condition: - if self._state in [RUNNING, FINISHED]: - return False - - if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: - return True - - self._state = CANCELLED - self._condition.notify_all() - - self._invoke_callbacks() - return True - - def cancelled(self): - """Return True if the future was cancelled.""" - with self._condition: - return self._state in [CANCELLED, CANCELLED_AND_NOTIFIED] - - def running(self): - """Return True if the future is currently executing.""" - with self._condition: - return self._state == RUNNING - - def done(self): - """Return True of the future was cancelled or finished executing. - """ - with self._condition: - return self._state in [CANCELLED, CANCELLED_AND_NOTIFIED, - FINISHED] - - def __get_result(self): - if self._exception: - raise self._exception - else: - return self._result - - def add_done_callback(self, fn): - """Attaches a callable that will be called when the future finishes. - - Args: - fn: A callable that will be called with this future as its only - argument when the future completes or is cancelled. The - callable will always be called by a thread in the same - process in which it was added. If the future has already - completed or been cancelled then the callable will be - called immediately. These callables are called in the order - that they were added. - """ - with self._condition: - if self._state not in [CANCELLED, CANCELLED_AND_NOTIFIED, - FINISHED]: - self._done_callbacks.append(fn) - return - fn(self) - - def result(self, timeout=None): - """Return the result of the call that the future represents. - - Args: - timeout: The number of seconds to wait for the result if the - future isn't done. 
If None, then there is no limit on the - wait time. - - Returns: - The result of the call that the future represents. - - Raises: - CancelledError: If the future was cancelled. - TimeoutError: If the future didn't finish executing before the - given timeout. - Exception: If the call raised then that exception will be - raised. - """ - with self._condition: - if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: - raise CancelledError() - elif self._state == FINISHED: - return self.__get_result() - - self._condition.wait(timeout) - - if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: - raise CancelledError() - elif self._state == FINISHED: - return self.__get_result() - else: - raise TimeoutError() - - def exception(self, timeout=None): - """Return the exception raised by the call that the future - represents. - - Args: - timeout: The number of seconds to wait for the exception if the - future isn't done. If None, then there is no limit on the - wait time. - - Returns: - The exception raised by the call that the future represents or - None if the call completed without raising. - - Raises: - CancelledError: If the future was cancelled. - TimeoutError: If the future didn't finish executing before the - given timeout. - """ - - with self._condition: - if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: - raise CancelledError() - elif self._state == FINISHED: - return self._exception - - self._condition.wait(timeout) - - if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: - raise CancelledError() - elif self._state == FINISHED: - return self._exception - else: - raise TimeoutError() - - # The following methods should only be used by Executors and in tests. - def set_running_or_notify_cancel(self): - """Mark the future as running or process any cancel notifications. - - Should only be used by Executor implementations and unit tests. - - If the future has been cancelled (cancel() was called and returned - True) then any threads waiting on the future completing (though - calls to as_completed() or wait()) are notified and False is - returned. - - If the future was not cancelled then it is put in the running state - (future calls to running() will return True) and True is returned. - - This method should be called by Executor implementations before - executing the work associated with this future. If this method - returns False then the work should not be executed. - - Returns: - False if the Future was cancelled, True otherwise. - - Raises: - RuntimeError: if this method was already called or if - set_result() or set_exception() was called. - """ - with self._condition: - if self._state == CANCELLED: - self._state = CANCELLED_AND_NOTIFIED - for waiter in self._waiters: - waiter.add_cancelled(self) - # self._condition.notify_all() is not necessary because - # self.cancel() triggers a notification. - return False - elif self._state == PENDING: - self._state = RUNNING - return True - else: - LOGGER.critical('Future %s in unexpected state: %s', - id(self), - self._state) - raise RuntimeError('Future in unexpected state') - - def set_result(self, result): - """Sets the return value of work associated with the future. - - Should only be used by Executor implementations and unit tests. - """ - with self._condition: - self._result = result - self._state = FINISHED - for waiter in self._waiters: - waiter.add_result(self) - self._condition.notify_all() - self._invoke_callbacks() - - def set_exception(self, exception): - """Sets the result of the future as being the given exception. 
- - Should only be used by Executor implementations and unit tests. - """ - with self._condition: - self._exception = exception - self._state = FINISHED - for waiter in self._waiters: - waiter.add_exception(self) - self._condition.notify_all() - self._invoke_callbacks() - - class Executor(object): - """This is an abstract base class for concrete asynchronous executors. - """ - - def submit(self, fn, *args, **kwargs): - """Submits a callable to be executed with the given arguments. - - Schedules the callable to be executed as fn(*args, **kwargs) and - returns a Future instance representing the execution of the - callable. - - Returns: - A Future representing the given call. - """ - raise NotImplementedError() - - def map(self, fn, *iterables, **kwargs): - """Returns an iterator equivalent to map(fn, iter). - - Args: - fn: A callable that will take as many arguments as there are - passed iterables. - timeout: The maximum number of seconds to wait. If None, then - there is no limit on the wait time. - chunksize: The size of the chunks the iterable will be broken - into before being passed to a child process. This argument - is only used by ProcessPoolExecutor; it is ignored by - ThreadPoolExecutor. - - Returns: - An iterator equivalent to: map(func, *iterables) but the calls - may be evaluated out-of-order. - - Raises: - TimeoutError: If the entire result iterator could not be - generated before the given timeout. - Exception: If fn(*args) raises for any values. - """ - timeout = kwargs.get('timeout') - if timeout is not None: - end_time = timeout + time.time() - - fs = [self.submit(fn, *args) for args in zip(*iterables)] - - # Yield must be hidden in closure so that the futures are submitted - # before the first iterator value is required. - def result_iterator(): - try: - for future in fs: - if timeout is None: - yield future.result() - else: - yield future.result(end_time - time.time()) - finally: - for future in fs: - future.cancel() - return result_iterator() - - def shutdown(self, wait=True): - """Clean-up the resources associated with the Executor. - - It is safe to call this method several times. Otherwise, no other - methods can be called after this one. - - Args: - wait: If True then shutdown will not return until all running - futures have finished executing and the resources used by - the executor have been reclaimed. - """ - pass - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.shutdown(wait=True) - return False +from concurrent.futures import Future as _BaseFuture +from concurrent.futures._base import LOGGER # To make loky._base.Future instances awaitable by concurrent.futures.wait, # derive our custom Future class from _BaseFuture. _invoke_callback is the only # modification made to this class in loky. +# TODO investigate why using `concurrent.futures.Future` directly does not +# always work in our test suite. 
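For illustration (not part of the patch), a sketch of what switching to the standard-library ``concurrent.futures`` primitives buys: futures returned by loky's executor can be passed straight to ``concurrent.futures.wait``. It assumes the vendored loky is importable as ``joblib.externals.loky``.

    from concurrent.futures import ALL_COMPLETED, wait

    from joblib.externals.loky import get_reusable_executor

    def square(x):
        return x * x

    if __name__ == "__main__":
        executor = get_reusable_executor(max_workers=2)
        futures = [executor.submit(square, i) for i in range(4)]
        # loky.Future derives from concurrent.futures.Future, so the standard
        # library waiting helpers accept it directly.
        done, _ = wait(futures, return_when=ALL_COMPLETED)
        print(sorted(f.result() for f in done))   # [0, 1, 4, 9]
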
class Future(_BaseFuture): def _invoke_callbacks(self): for callback in self._done_callbacks: try: callback(self) except BaseException: - LOGGER.exception('exception calling callback for %r', self) + LOGGER.exception(f'exception calling callback for {self!r}') diff --git a/joblib/externals/loky/backend/__init__.py b/joblib/externals/loky/backend/__init__.py index a65ce0e8b..c31023cc5 100644 --- a/joblib/externals/loky/backend/__init__.py +++ b/joblib/externals/loky/backend/__init__.py @@ -1,16 +1,14 @@ import os -import sys +from multiprocessing import synchronize from .context import get_context -if sys.version_info > (3, 4): - def _make_name(): - name = '/loky-%i-%s' % (os.getpid(), next(synchronize.SemLock._rand)) - return name +def _make_name(): + return f'/loky-{os.getpid()}-{next(synchronize.SemLock._rand)}' - # monkey patch the name creation for multiprocessing - from multiprocessing import synchronize - synchronize.SemLock._make_name = staticmethod(_make_name) + +# monkey patch the name creation for multiprocessing +synchronize.SemLock._make_name = staticmethod(_make_name) __all__ = ["get_context"] diff --git a/joblib/externals/loky/backend/_posix_reduction.py b/joblib/externals/loky/backend/_posix_reduction.py index e0e394d3c..e9f34ed56 100644 --- a/joblib/externals/loky/backend/_posix_reduction.py +++ b/joblib/externals/loky/backend/_posix_reduction.py @@ -7,18 +7,12 @@ # * Add adapted reduction for LokyProcesses and socket/Connection # import os -import sys import socket import _socket +from multiprocessing.connection import Connection +from multiprocessing.context import get_spawning_popen from .reduction import register -from .context import get_spawning_popen - -if sys.version_info >= (3, 3): - from multiprocessing.connection import Connection -else: - from _multiprocessing import Connection - HAVE_SEND_HANDLE = (hasattr(socket, 'CMSG_LEN') and hasattr(socket, 'SCM_RIGHTS') and @@ -26,8 +20,7 @@ def _mk_inheritable(fd): - if sys.version_info[:2] > (3, 3): - os.set_inheritable(fd, True) + os.set_inheritable(fd, True) return fd @@ -36,7 +29,7 @@ def DupFd(fd): popen_obj = get_spawning_popen() if popen_obj is not None: return popen_obj.DupFd(popen_obj.duplicate_for_child(fd)) - elif HAVE_SEND_HANDLE and sys.version_info[:2] > (3, 3): + elif HAVE_SEND_HANDLE: from multiprocessing import resource_sharer return resource_sharer.DupFd(fd) else: @@ -46,31 +39,26 @@ def DupFd(fd): ) -if sys.version_info[:2] != (3, 3): - def _reduce_socket(s): - df = DupFd(s.fileno()) - return _rebuild_socket, (df, s.family, s.type, s.proto) +def _reduce_socket(s): + df = DupFd(s.fileno()) + return _rebuild_socket, (df, s.family, s.type, s.proto) - def _rebuild_socket(df, family, type, proto): - fd = df.detach() - return socket.fromfd(fd, family, type, proto) -else: - from multiprocessing.reduction import reduce_socket as _reduce_socket +def _rebuild_socket(df, family, type, proto): + fd = df.detach() + return socket.fromfd(fd, family, type, proto) -register(socket.socket, _reduce_socket) -register(_socket.socket, _reduce_socket) +def rebuild_connection(df, readable, writable): + fd = df.detach() + return Connection(fd, readable, writable) -if sys.version_info[:2] != (3, 3): - def reduce_connection(conn): - df = DupFd(conn.fileno()) - return rebuild_connection, (df, conn.readable, conn.writable) - def rebuild_connection(df, readable, writable): - fd = df.detach() - return Connection(fd, readable, writable) -else: - from multiprocessing.reduction import reduce_connection +def reduce_connection(conn): + 
df = DupFd(conn.fileno()) + return rebuild_connection, (df, conn.readable, conn.writable) + +register(socket.socket, _reduce_socket) +register(_socket.socket, _reduce_socket) register(Connection, reduce_connection) diff --git a/joblib/externals/loky/backend/_posix_wait.py b/joblib/externals/loky/backend/_posix_wait.py deleted file mode 100644 index d935882dc..000000000 --- a/joblib/externals/loky/backend/_posix_wait.py +++ /dev/null @@ -1,105 +0,0 @@ -############################################################################### -# Compat for wait function on UNIX based system -# -# author: Thomas Moreau and Olivier Grisel -# -# adapted from multiprocessing/connection.py (17/02/2017) -# * Backport wait function to python2.7 -# - -import platform -import select -import socket -import errno -SYSTEM = platform.system() - -try: - import ctypes -except ImportError: # pragma: no cover - ctypes = None # noqa - -if SYSTEM == 'Darwin' and ctypes is not None: - from ctypes.util import find_library - libSystem = ctypes.CDLL(find_library('libSystem.dylib')) - CoreServices = ctypes.CDLL(find_library('CoreServices'), - use_errno=True) - mach_absolute_time = libSystem.mach_absolute_time - mach_absolute_time.restype = ctypes.c_uint64 - absolute_to_nanoseconds = CoreServices.AbsoluteToNanoseconds - absolute_to_nanoseconds.restype = ctypes.c_uint64 - absolute_to_nanoseconds.argtypes = [ctypes.c_uint64] - - def monotonic(): - return absolute_to_nanoseconds(mach_absolute_time()) * 1e-9 - -elif SYSTEM == 'Linux' and ctypes is not None: - # from stackoverflow: - # questions/1205722/how-do-i-get-monotonic-time-durations-in-python - import ctypes - import os - - CLOCK_MONOTONIC = 1 # see - - class timespec(ctypes.Structure): - _fields_ = [ - ('tv_sec', ctypes.c_long), - ('tv_nsec', ctypes.c_long), - ] - - librt = ctypes.CDLL('librt.so.1', use_errno=True) - clock_gettime = librt.clock_gettime - clock_gettime.argtypes = [ - ctypes.c_int, ctypes.POINTER(timespec), - ] - - def monotonic(): # noqa - t = timespec() - if clock_gettime(CLOCK_MONOTONIC, ctypes.pointer(t)) != 0: - errno_ = ctypes.get_errno() - raise OSError(errno_, os.strerror(errno_)) - return t.tv_sec + t.tv_nsec * 1e-9 -else: # pragma: no cover - from time import time as monotonic - - -if hasattr(select, 'poll'): - def _poll(fds, timeout): - if timeout is not None: - timeout = int(timeout * 1000) # timeout is in milliseconds - fd_map = {} - pollster = select.poll() - for fd in fds: - pollster.register(fd, select.POLLIN) - if hasattr(fd, 'fileno'): - fd_map[fd.fileno()] = fd - else: - fd_map[fd] = fd - ls = [] - for fd, event in pollster.poll(timeout): - if event & select.POLLNVAL: # pragma: no cover - raise ValueError('invalid file descriptor %i' % fd) - ls.append(fd_map[fd]) - return ls -else: - def _poll(fds, timeout): - return select.select(fds, [], [], timeout)[0] - - -def wait(object_list, timeout=None): - ''' - Wait till an object in object_list is ready/readable. - Returns list of those objects which are ready/readable. 
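The backported ``wait`` being deleted above is replaced elsewhere in this patch by the standard-library helper; a minimal sketch of the equivalent call, for reference only.

    from multiprocessing import Pipe
    from multiprocessing.connection import wait

    r, w = Pipe(duplex=False)
    w.send("ping")
    # wait() blocks until at least one object is ready for reading or the
    # timeout (in seconds) expires, and returns the list of ready objects.
    ready = wait([r], timeout=1.0)
    print([conn.recv() for conn in ready])   # ['ping']
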
- ''' - if timeout is not None: - if timeout <= 0: - return _poll(object_list, 0) - else: - deadline = monotonic() + timeout - while True: - try: - return _poll(object_list, timeout) - except (OSError, IOError, socket.error) as e: # pragma: no cover - if e.errno != errno.EINTR: - raise - if timeout is not None: - timeout = deadline - monotonic() diff --git a/joblib/externals/loky/backend/_win_reduction.py b/joblib/externals/loky/backend/_win_reduction.py index 142e6e7c8..7f50c9f61 100644 --- a/joblib/externals/loky/backend/_win_reduction.py +++ b/joblib/externals/loky/backend/_win_reduction.py @@ -7,93 +7,54 @@ # * Add adapted reduction for LokyProcesses and socket/PipeConnection # import os -import sys import socket -from .reduction import register - - -if sys.platform == 'win32': - if sys.version_info[:2] < (3, 3): - from _multiprocessing import PipeConnection - else: - import _winapi - from multiprocessing.connection import PipeConnection - - -if sys.version_info[:2] >= (3, 4) and sys.platform == 'win32': - class DupHandle(object): - def __init__(self, handle, access, pid=None): - # duplicate handle for process with given pid - if pid is None: - pid = os.getpid() - proc = _winapi.OpenProcess(_winapi.PROCESS_DUP_HANDLE, False, pid) - try: - self._handle = _winapi.DuplicateHandle( - _winapi.GetCurrentProcess(), - handle, proc, access, False, 0) - finally: - _winapi.CloseHandle(proc) - self._access = access - self._pid = pid - - def detach(self): - # retrieve handle from process which currently owns it - if self._pid == os.getpid(): - return self._handle - proc = _winapi.OpenProcess(_winapi.PROCESS_DUP_HANDLE, False, - self._pid) - try: - return _winapi.DuplicateHandle( - proc, self._handle, _winapi.GetCurrentProcess(), - self._access, False, _winapi.DUPLICATE_CLOSE_SOURCE) - finally: - _winapi.CloseHandle(proc) - - def reduce_pipe_connection(conn): - access = ((_winapi.FILE_GENERIC_READ if conn.readable else 0) | - (_winapi.FILE_GENERIC_WRITE if conn.writable else 0)) - dh = DupHandle(conn.fileno(), access) - return rebuild_pipe_connection, (dh, conn.readable, conn.writable) +import _winapi +from multiprocessing.connection import PipeConnection +from multiprocessing.reduction import _reduce_socket - def rebuild_pipe_connection(dh, readable, writable): - from multiprocessing.connection import PipeConnection - handle = dh.detach() - return PipeConnection(handle, readable, writable) - register(PipeConnection, reduce_pipe_connection) - -elif sys.platform == 'win32': - # Older Python versions - from multiprocessing.reduction import reduce_pipe_connection - register(PipeConnection, reduce_pipe_connection) - - -if sys.version_info[:2] < (3, 3) and sys.platform == 'win32': - from _multiprocessing import win32 - from multiprocessing.reduction import reduce_handle, rebuild_handle - close = win32.CloseHandle - - def fromfd(handle, family, type_, proto=0): - s = socket.socket(family, type_, proto, fileno=handle) - if s.__class__ is not socket.socket: - s = socket.socket(_sock=s) - return s - - def reduce_socket(s): - if not hasattr(socket, "fromfd"): - raise TypeError("sockets cannot be pickled on this system.") - reduced_handle = reduce_handle(s.fileno()) - return _rebuild_socket, (reduced_handle, s.family, s.type, s.proto) +from .reduction import register - def _rebuild_socket(reduced_handle, family, type_, proto): - handle = rebuild_handle(reduced_handle) - s = fromfd(handle, family, type_, proto) - close(handle) - return s - register(socket.socket, reduce_socket) -elif sys.version_info[:2] < (3, 
4): - from multiprocessing.reduction import reduce_socket - register(socket.socket, reduce_socket) -else: - from multiprocessing.reduction import _reduce_socket - register(socket.socket, _reduce_socket) +class DupHandle: + def __init__(self, handle, access, pid=None): + # duplicate handle for process with given pid + if pid is None: + pid = os.getpid() + proc = _winapi.OpenProcess(_winapi.PROCESS_DUP_HANDLE, False, pid) + try: + self._handle = _winapi.DuplicateHandle( + _winapi.GetCurrentProcess(), + handle, proc, access, False, 0) + finally: + _winapi.CloseHandle(proc) + self._access = access + self._pid = pid + + def detach(self): + # retrieve handle from process which currently owns it + if self._pid == os.getpid(): + return self._handle + proc = _winapi.OpenProcess(_winapi.PROCESS_DUP_HANDLE, False, + self._pid) + try: + return _winapi.DuplicateHandle( + proc, self._handle, _winapi.GetCurrentProcess(), + self._access, False, _winapi.DUPLICATE_CLOSE_SOURCE) + finally: + _winapi.CloseHandle(proc) + + +def rebuild_pipe_connection(dh, readable, writable): + handle = dh.detach() + return PipeConnection(handle, readable, writable) + + +def reduce_pipe_connection(conn): + access = ((_winapi.FILE_GENERIC_READ if conn.readable else 0) | + (_winapi.FILE_GENERIC_WRITE if conn.writable else 0)) + dh = DupHandle(conn.fileno(), access) + return rebuild_pipe_connection, (dh, conn.readable, conn.writable) + + +register(PipeConnection, reduce_pipe_connection) +register(socket.socket, _reduce_socket) diff --git a/joblib/externals/loky/backend/_win_wait.py b/joblib/externals/loky/backend/_win_wait.py deleted file mode 100644 index 73271316d..000000000 --- a/joblib/externals/loky/backend/_win_wait.py +++ /dev/null @@ -1,58 +0,0 @@ -############################################################################### -# Compat for wait function on Windows system -# -# author: Thomas Moreau and Olivier Grisel -# -# adapted from multiprocessing/connection.py (17/02/2017) -# * Backport wait function to python2.7 -# - -import ctypes -import sys -from time import sleep - - -if sys.platform == 'win32' and sys.version_info[:2] < (3, 3): - from _subprocess import WaitForSingleObject, WAIT_OBJECT_0 - - try: - from time import monotonic - except ImportError: - # Backward old for crappy old Python that did not have cross-platform - # monotonic clock by default. - - # TODO: do we want to add support for cygwin at some point? See: - # https://github.com/atdt/monotonic/blob/master/monotonic.py - GetTickCount64 = ctypes.windll.kernel32.GetTickCount64 - GetTickCount64.restype = ctypes.c_ulonglong - - def monotonic(): - """Monotonic clock, cannot go backward.""" - return GetTickCount64() / 1000.0 - - def wait(handles, timeout=None): - """Backward compat for python2.7 - - This function wait for either: - * one connection is ready for read, - * one process handle has exited or got killed, - * timeout is reached. Note that this function has a precision of 2 - msec. 
- """ - if timeout is not None: - deadline = monotonic() + timeout - - while True: - # We cannot use select as in windows it only support sockets - ready = [] - for h in handles: - if type(h) in [int, long]: - if WaitForSingleObject(h, 0) == WAIT_OBJECT_0: - ready += [h] - elif h.poll(0): - ready.append(h) - if len(ready) > 0: - return ready - sleep(.001) - if timeout is not None and deadline - monotonic() <= 0: - return [] diff --git a/joblib/externals/loky/backend/compat.py b/joblib/externals/loky/backend/compat.py deleted file mode 100644 index aa406c6cf..000000000 --- a/joblib/externals/loky/backend/compat.py +++ /dev/null @@ -1,41 +0,0 @@ -############################################################################### -# Compat file to import the correct modules for each platform and python -# version. -# -# author: Thomas Moreau and Olivier grisel -# -import sys - -PY3 = sys.version_info[:2] >= (3, 3) - -if PY3: - import queue -else: - import Queue as queue - -if sys.version_info >= (3, 4): - from multiprocessing.process import BaseProcess -else: - from multiprocessing.process import Process as BaseProcess - -# Platform specific compat -if sys.platform == "win32": - from .compat_win32 import wait -else: - from .compat_posix import wait - - -def set_cause(exc, cause): - exc.__cause__ = cause - - if not PY3: - # Preformat message here. - if exc.__cause__ is not None: - exc.args = ("{}\n\nThis was caused directly by {}".format( - exc.args if len(exc.args) != 1 else exc.args[0], - str(exc.__cause__)),) - - return exc - - -__all__ = ["queue", "BaseProcess", "set_cause", "wait"] diff --git a/joblib/externals/loky/backend/compat_posix.py b/joblib/externals/loky/backend/compat_posix.py deleted file mode 100644 index c8e4e4a43..000000000 --- a/joblib/externals/loky/backend/compat_posix.py +++ /dev/null @@ -1,13 +0,0 @@ -# flake8: noqa -############################################################################### -# Compat file to load the correct wait function -# -# author: Thomas Moreau and Olivier grisel -# -import sys - -# Compat wait -if sys.version_info < (3, 3): - from ._posix_wait import wait -else: - from multiprocessing.connection import wait diff --git a/joblib/externals/loky/backend/compat_win32.py b/joblib/externals/loky/backend/compat_win32.py deleted file mode 100644 index 5df15f55f..000000000 --- a/joblib/externals/loky/backend/compat_win32.py +++ /dev/null @@ -1,46 +0,0 @@ -# flake8: noqa: F401 -import sys -import numbers - -if sys.platform == "win32": - # Avoid import error by code introspection tools such as test runners - # trying to import this module while running on non-Windows systems. 
- - # Compat Popen - if sys.version_info[:2] >= (3, 4): - from multiprocessing.popen_spawn_win32 import Popen - else: - from multiprocessing.forking import Popen - - # wait compat - if sys.version_info[:2] < (3, 3): - from ._win_wait import wait - else: - from multiprocessing.connection import wait - - # Compat _winapi - if sys.version_info[:2] >= (3, 4): - import _winapi - else: - import os - import msvcrt - if sys.version_info[:2] < (3, 3): - import _subprocess as win_api - from _multiprocessing import win32 - else: - import _winapi as win_api - - class _winapi: - CreateProcess = win_api.CreateProcess - - @staticmethod - def CloseHandle(h): - if isinstance(h, numbers.Integral): - # Cast long to int for 64-bit Python 2.7 under Windows - h = int(h) - if sys.version_info[:2] < (3, 3): - if not isinstance(h, int): - h = h.Detach() - win32.CloseHandle(h) - else: - win_api.CloseHandle(h) diff --git a/joblib/externals/loky/backend/context.py b/joblib/externals/loky/backend/context.py index 76f6520d3..7e551688b 100644 --- a/joblib/externals/loky/backend/context.py +++ b/joblib/externals/loky/backend/context.py @@ -1,6 +1,5 @@ ############################################################################### -# Basic context management with LokyContext and provides -# compat for UNIX 2.7 and 3.3 +# Basic context management with LokyContext # # author: Thomas Moreau and Olivier Grisel # @@ -8,88 +7,46 @@ # * Create a context ensuring loky uses only objects that are compatible # * Add LokyContext to the list of context of multiprocessing so loky can be # used with multiprocessing.set_start_method -# * Add some compat function for python2.7 and 3.3. +# * Implement a CFS-aware amd physical-core aware cpu_count function. # -from __future__ import division - import os import sys +import math import subprocess import traceback import warnings import multiprocessing as mp - +from multiprocessing import get_context as mp_get_context +from multiprocessing.context import BaseContext from .process import LokyProcess, LokyInitMainProcess -START_METHODS = ['loky', 'loky_init_main'] +START_METHODS = ['loky', 'loky_init_main', 'spawn'] +if sys.platform != 'win32': + START_METHODS += ['fork', 'forkserver'] + _DEFAULT_START_METHOD = None # Cache for the number of physical cores to avoid repeating subprocess calls. # It should not change during the lifetime of the program. physical_cores_cache = None -if sys.version_info[:2] >= (3, 4): - from multiprocessing import get_context as mp_get_context - from multiprocessing.context import assert_spawning, set_spawning_popen - from multiprocessing.context import get_spawning_popen, BaseContext - - START_METHODS += ['spawn'] - if sys.platform != 'win32': - START_METHODS += ['fork', 'forkserver'] - - def get_context(method=None): - # Try to overload the default context - method = method or _DEFAULT_START_METHOD or "loky" - if method == "fork": - # If 'fork' is explicitly requested, warn user about potential - # issues. - warnings.warn("`fork` start method should not be used with " - "`loky` as it does not respect POSIX. Try using " - "`spawn` or `loky` instead.", UserWarning) - try: - context = mp_get_context(method) - except ValueError: - raise ValueError("Unknown context '{}'. Value should be in {}." 
- .format(method, START_METHODS)) - - return context - -else: - if sys.platform != 'win32': - import threading - # Mechanism to check that the current thread is spawning a process - _tls = threading.local() - popen_attr = 'spawning_popen' - else: - from multiprocessing.forking import Popen - _tls = Popen._tls - popen_attr = 'process_handle' - - BaseContext = object - - def get_spawning_popen(): - return getattr(_tls, popen_attr, None) - - def set_spawning_popen(popen): - setattr(_tls, popen_attr, popen) - - def assert_spawning(obj): - if get_spawning_popen() is None: - raise RuntimeError( - '%s objects should only be shared between processes' - ' through inheritance' % type(obj).__name__ - ) - - def get_context(method=None): - method = method or _DEFAULT_START_METHOD or 'loky' - if method == "loky": - return LokyContext() - elif method == "loky_init_main": - return LokyInitMainContext() - else: - raise ValueError("Unknown context '{}'. Value should be in {}." - .format(method, START_METHODS)) + +def get_context(method=None): + # Try to overload the default context + method = method or _DEFAULT_START_METHOD or "loky" + if method == "fork": + # If 'fork' is explicitly requested, warn user about potential issues. + warnings.warn("`fork` start method should not be used with " + "`loky` as it does not respect POSIX. Try using " + "`spawn` or `loky` instead.", UserWarning) + try: + return mp_get_context(method) + except ValueError: + raise ValueError( + f"Unknown context '{method}'. Value should be in " + f"{START_METHODS}." + ) def set_start_method(method, force=False): @@ -97,8 +54,9 @@ def set_start_method(method, force=False): if _DEFAULT_START_METHOD is not None and not force: raise RuntimeError('context has already been set') assert method is None or method in START_METHODS, ( - "'{}' is not a valid start_method. It should be in {}" - .format(method, START_METHODS)) + f"'{method}' is not a valid start_method. It should be in " + f"{START_METHODS}" + ) _DEFAULT_START_METHOD = method @@ -114,8 +72,8 @@ def cpu_count(only_physical_cores=False): * the number of CPUs in the system, as given by ``multiprocessing.cpu_count``; * the CPU affinity settings of the current process - (available with Python 3.4+ on some Unix systems); - * CFS scheduler CPU bandwidth limit (available on Linux only, typically + (available on some Unix systems); + * Cgroup CPU bandwidth limit (available on Linux only, typically set by docker and similar container orchestration systems); * the value of the LOKY_MAX_CPU_COUNT environment variable if defined. and is given as the minimum of these constraints. @@ -123,81 +81,95 @@ def cpu_count(only_physical_cores=False): If ``only_physical_cores`` is True, return the number of physical cores instead of the number of logical cores (hyperthreading / SMT). Note that this option is not enforced if the number of usable cores is controlled in - any other way such as: process affinity, restricting CFS scheduler policy + any other way such as: process affinity, Cgroup restricted CPU bandwidth or the LOKY_MAX_CPU_COUNT environment variable. If the number of physical cores is not found, return the number of logical cores. - + It is also always larger or equal to 1. 
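To make the aggregation described in the docstring above concrete, a rough sketch with invented quota/period numbers; the real implementation also reads the cgroup files and the process CPU affinity, which are left out here for brevity.

    import math
    import os

    os_cpu_count = os.cpu_count() or 1                 # logical CPUs
    cpu_quota_us, cpu_period_us = 150_000, 100_000     # e.g. docker --cpus=1.5
    cgroup_limit = math.ceil(cpu_quota_us / cpu_period_us)   # -> 2
    env_limit = int(os.environ.get("LOKY_MAX_CPU_COUNT", os_cpu_count))

    # cpu_count() returns the most restrictive of the constraints,
    # never going below 1.
    print(max(min(os_cpu_count, cgroup_limit, env_limit), 1))
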
""" - # TODO: use os.cpu_count when dropping python 2 support - try: - cpu_count_mp = mp.cpu_count() - except NotImplementedError: - cpu_count_mp = 1 - - cpu_count_user = _cpu_count_user(cpu_count_mp) - aggregate_cpu_count = min(cpu_count_mp, cpu_count_user) - - if only_physical_cores: - cpu_count_physical, exception = _count_physical_cores() - if cpu_count_user < cpu_count_mp: - # Respect user setting - cpu_count = max(cpu_count_user, 1) - elif cpu_count_physical == "not found": - # Fallback to default behavior - if exception is not None: - # warns only the first time - warnings.warn( - "Could not find the number of physical cores for the " - "following reason:\n" + str(exception) + "\n" - "Returning the number of logical cores instead. You can " - "silence this warning by setting LOKY_MAX_CPU_COUNT to " - "the number of cores you want to use.") - if sys.version_info >= (3, 5): - # TODO remove the version check when dropping py2 support - traceback.print_tb(exception.__traceback__) - - cpu_count = max(aggregate_cpu_count, 1) - else: - return cpu_count_physical + # Note: os.cpu_count() is allowed to return None in its docstring + os_cpu_count = os.cpu_count() or 1 + + cpu_count_user = _cpu_count_user(os_cpu_count) + aggregate_cpu_count = max(min(os_cpu_count, cpu_count_user), 1) + + if not only_physical_cores: + return aggregate_cpu_count + + if cpu_count_user < os_cpu_count: + # Respect user setting + return max(cpu_count_user, 1) + + cpu_count_physical, exception = _count_physical_cores() + if cpu_count_physical != "not found": + return cpu_count_physical + + # Fallback to default behavior + if exception is not None: + # warns only the first time + warnings.warn( + "Could not find the number of physical cores for the " + f"following reason:\n{exception}\n" + "Returning the number of logical cores instead. You can " + "silence this warning by setting LOKY_MAX_CPU_COUNT to " + "the number of cores you want to use.") + traceback.print_tb(exception.__traceback__) + + return aggregate_cpu_count + + +def _cpu_count_cgroup(os_cpu_count): + # Cgroup CPU bandwidth limit available in Linux since 2.6 kernel + cpu_max_fname = "/sys/fs/cgroup/cpu.max" + cfs_quota_fname = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us" + cfs_period_fname = "/sys/fs/cgroup/cpu/cpu.cfs_period_us" + if os.path.exists(cpu_max_fname): + # cgroup v2 + # https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html + with open(cpu_max_fname) as fh: + cpu_quota_us, cpu_period_us = fh.read().strip().split() + elif os.path.exists(cfs_quota_fname) and os.path.exists(cfs_period_fname): + # cgroup v1 + # https://www.kernel.org/doc/html/latest/scheduler/sched-bwc.html#management + with open(cfs_quota_fname) as fh: + cpu_quota_us = fh.read().strip() + with open(cfs_period_fname) as fh: + cpu_period_us = fh.read().strip() else: - cpu_count = max(aggregate_cpu_count, 1) + # No Cgroup CPU bandwidth limit (e.g. 
non-Linux platform) + cpu_quota_us = "max" + cpu_period_us = 100_000 # unused, for consistency with default values - return cpu_count + if cpu_quota_us == "max": + # No active Cgroup quota on a Cgroup-capable platform + return os_cpu_count + else: + cpu_quota_us = int(cpu_quota_us) + cpu_period_us = int(cpu_period_us) + if cpu_quota_us > 0 and cpu_period_us > 0: + return math.ceil(cpu_quota_us / cpu_period_us) + else: # pragma: no cover + # Setting a negative cpu_quota_us value is a valid way to disable + # cgroup CPU bandwith limits + return os_cpu_count -def _cpu_count_user(cpu_count_mp): +def _cpu_count_user(os_cpu_count): """Number of user defined available CPUs""" - import math - # Number of available CPUs given affinity settings - cpu_count_affinity = cpu_count_mp + cpu_count_affinity = os_cpu_count if hasattr(os, 'sched_getaffinity'): try: cpu_count_affinity = len(os.sched_getaffinity(0)) except NotImplementedError: pass - # CFS scheduler CPU bandwidth limit - # available in Linux since 2.6 kernel - cpu_count_cfs = cpu_count_mp - cfs_quota_fname = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us" - cfs_period_fname = "/sys/fs/cgroup/cpu/cpu.cfs_period_us" - if os.path.exists(cfs_quota_fname) and os.path.exists(cfs_period_fname): - with open(cfs_quota_fname, 'r') as fh: - cfs_quota_us = int(fh.read()) - with open(cfs_period_fname, 'r') as fh: - cfs_period_us = int(fh.read()) - - if cfs_quota_us > 0 and cfs_period_us > 0: - # Make sure this quantity is an int as math.ceil returns a - # float in python2.7. (See issue #165) - cpu_count_cfs = int(math.ceil(cfs_quota_us / cfs_period_us)) + cpu_count_cgroup = _cpu_count_cgroup(os_cpu_count) # User defined soft-limit passed as a loky specific environment variable. - cpu_count_loky = int(os.environ.get('LOKY_MAX_CPU_COUNT', cpu_count_mp)) + cpu_count_loky = int(os.environ.get('LOKY_MAX_CPU_COUNT', os_cpu_count)) - return min(cpu_count_affinity, cpu_count_cfs, cpu_count_loky) + return min(cpu_count_affinity, cpu_count_cgroup, cpu_count_loky) def _count_physical_cores(): @@ -219,39 +191,39 @@ def _count_physical_cores(): try: if sys.platform == "linux": cpu_info = subprocess.run( - "lscpu --parse=core".split(" "), capture_output=True) - cpu_info = cpu_info.stdout.decode("utf-8").splitlines() + "lscpu --parse=core".split(), capture_output=True, text=True) + cpu_info = cpu_info.stdout.splitlines() cpu_info = {line for line in cpu_info if not line.startswith("#")} cpu_count_physical = len(cpu_info) elif sys.platform == "win32": cpu_info = subprocess.run( - "wmic CPU Get NumberOfCores /Format:csv".split(" "), - capture_output=True) - cpu_info = cpu_info.stdout.decode('utf-8').splitlines() + "wmic CPU Get NumberOfCores /Format:csv".split(), + capture_output=True, text=True) + cpu_info = cpu_info.stdout.splitlines() cpu_info = [l.split(",")[1] for l in cpu_info if (l and l != "Node,NumberOfCores")] cpu_count_physical = sum(map(int, cpu_info)) elif sys.platform == "darwin": cpu_info = subprocess.run( - "sysctl -n hw.physicalcpu".split(" "), capture_output=True) - cpu_info = cpu_info.stdout.decode('utf-8') + "sysctl -n hw.physicalcpu".split(), + capture_output=True, text=True) + cpu_info = cpu_info.stdout cpu_count_physical = int(cpu_info) else: - raise NotImplementedError( - "unsupported platform: {}".format(sys.platform)) + raise NotImplementedError(f"unsupported platform: {sys.platform}") # if cpu_count_physical < 1, we did not find a valid value if cpu_count_physical < 1: raise ValueError( - "found {} physical cores < 1".format(cpu_count_physical)) - + 
f"found {cpu_count_physical} physical cores < 1") + except Exception as e: exception = e cpu_count_physical = "not found" # Put the result in cache physical_cores_cache = cpu_count_physical - + return cpu_count_physical, exception @@ -272,44 +244,10 @@ def SimpleQueue(self, reducers=None): from .queues import SimpleQueue return SimpleQueue(reducers=reducers, ctx=self.get_context()) - if sys.version_info[:2] < (3, 4): - """Compat for python2.7/3.3 for necessary methods in Context""" - def get_context(self): - return self - - def get_start_method(self): - return self._name - - def Pipe(self, duplex=True): - '''Returns two connection object connected by a pipe''' - return mp.Pipe(duplex) - - if sys.platform != "win32": - """Use the compat Manager for python2.7/3.3 on UNIX to avoid - relying on fork processes - """ - def Manager(self): - """Returns a manager object""" - from .managers import LokyManager - m = LokyManager() - m.start() - return m - else: - """Compat for context on Windows and python2.7/3.3. Using regular - multiprocessing objects as it does not rely on fork. - """ - from multiprocessing import synchronize - Semaphore = staticmethod(synchronize.Semaphore) - BoundedSemaphore = staticmethod(synchronize.BoundedSemaphore) - Lock = staticmethod(synchronize.Lock) - RLock = staticmethod(synchronize.RLock) - Condition = staticmethod(synchronize.Condition) - Event = staticmethod(synchronize.Event) - Manager = staticmethod(mp.Manager) - if sys.platform != "win32": """For Unix platform, use our custom implementation of synchronize - relying on ctypes to interface with pthread semaphores. + ensuring that we use the loky.backend.resource_tracker to clean-up + the semaphores in case of a worker crash. """ def Semaphore(self, value=1): """Returns a semaphore object""" @@ -352,7 +290,7 @@ class LokyInitMainContext(LokyContext): functions and variable used from main should be out of this block. This mimics the default behavior of multiprocessing under Windows and the - behavior of the ``spawn`` start method on a posix system for python3.4+. + behavior of the ``spawn`` start method on a posix system. 
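A minimal usage sketch of the spawn-like behaviour the ``LokyInitMainContext`` docstring describes; the ``'loky_init_main'`` name comes from ``START_METHODS`` above, everything else is illustrative.

    from joblib.externals.loky.backend.context import get_context

    def announce():
        print("running in a loky worker")

    if __name__ == "__main__":
        # Like 'spawn', 'loky_init_main' re-imports the main module in the
        # child process, so code with side effects belongs under this guard.
        ctx = get_context("loky_init_main")
        p = ctx.Process(target=announce)
        p.start()
        p.join()
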
For more details, see the end of the following section of python doc https://docs.python.org/3/library/multiprocessing.html#multiprocessing-programming """ @@ -360,8 +298,7 @@ class LokyInitMainContext(LokyContext): Process = LokyInitMainProcess -if sys.version_info > (3, 4): - """Register loky context so it works with multiprocessing.get_context""" - ctx_loky = LokyContext() - mp.context._concrete_contexts['loky'] = ctx_loky - mp.context._concrete_contexts['loky_init_main'] = LokyInitMainContext() +# Register loky context so it works with multiprocessing.get_context +ctx_loky = LokyContext() +mp.context._concrete_contexts['loky'] = ctx_loky +mp.context._concrete_contexts['loky_init_main'] = LokyInitMainContext() diff --git a/joblib/externals/loky/backend/fork_exec.py b/joblib/externals/loky/backend/fork_exec.py index cfb68dc4e..211d1835a 100644 --- a/joblib/externals/loky/backend/fork_exec.py +++ b/joblib/externals/loky/backend/fork_exec.py @@ -7,24 +7,20 @@ import os import sys -if sys.platform == "darwin" and sys.version_info < (3, 3): - FileNotFoundError = OSError - def close_fds(keep_fds): # pragma: no cover """Close all the file descriptors except those in keep_fds.""" # Make sure to keep stdout and stderr open for logging purpose - keep_fds = set(keep_fds).union([1, 2]) + keep_fds = {*keep_fds, 1, 2} # We try to retrieve all the open fds try: - open_fds = set(int(fd) for fd in os.listdir('/proc/self/fd')) + open_fds = {int(fd) for fd in os.listdir('/proc/self/fd')} except FileNotFoundError: import resource max_nfds = resource.getrlimit(resource.RLIMIT_NOFILE)[0] - open_fds = set(fd for fd in range(3, max_nfds)) - open_fds.add(0) + open_fds = {*range(max_nfds)} for i in open_fds - keep_fds: try: @@ -34,11 +30,9 @@ def close_fds(keep_fds): # pragma: no cover def fork_exec(cmd, keep_fds, env=None): - # copy the environment variables to set in the child process - env = {} if env is None else env - child_env = os.environ.copy() - child_env.update(env) + env = env or {} + child_env = {**os.environ, **env} pid = os.fork() if pid == 0: # pragma: no cover diff --git a/joblib/externals/loky/backend/managers.py b/joblib/externals/loky/backend/managers.py deleted file mode 100644 index 081f8976e..000000000 --- a/joblib/externals/loky/backend/managers.py +++ /dev/null @@ -1,51 +0,0 @@ -############################################################################### -# compat for UNIX 2.7 and 3.3 -# Manager with LokyContext server. -# This avoids having a Manager using fork and breaks the fd. 
-# -# author: Thomas Moreau and Olivier Grisel -# -# based on multiprocessing/managers.py (17/02/2017) -# * Overload the start method to use LokyContext and launch a loky subprocess -# - -import multiprocessing as mp -from multiprocessing.managers import SyncManager, State -from .process import LokyProcess as Process - - -class LokyManager(SyncManager): - def start(self, initializer=None, initargs=()): - '''Spawn a server process for this manager object''' - assert self._state.value == State.INITIAL - - if (initializer is not None - and not hasattr(initializer, '__call__')): - raise TypeError('initializer must be a callable') - - # pipe over which we will retrieve address of server - reader, writer = mp.Pipe(duplex=False) - - # spawn process which runs a server - self._process = Process( - target=type(self)._run_server, - args=(self._registry, self._address, bytes(self._authkey), - self._serializer, writer, initializer, initargs), - ) - ident = ':'.join(str(i) for i in self._process._identity) - self._process.name = type(self).__name__ + '-' + ident - self._process.start() - - # get address of server - writer.close() - self._address = reader.recv() - reader.close() - - # register a finalizer - self._state.value = State.STARTED - self.shutdown = mp.util.Finalize( - self, type(self)._finalize_manager, - args=(self._process, self._address, self._authkey, - self._state, self._Client), - exitpriority=0 - ) diff --git a/joblib/externals/loky/backend/popen_loky_posix.py b/joblib/externals/loky/backend/popen_loky_posix.py index 970dead0b..37a73172e 100644 --- a/joblib/externals/loky/backend/popen_loky_posix.py +++ b/joblib/externals/loky/backend/popen_loky_posix.py @@ -8,173 +8,141 @@ import signal import pickle from io import BytesIO - -from . import reduction, spawn -from .context import get_spawning_popen, set_spawning_popen from multiprocessing import util, process +from multiprocessing.connection import wait +from multiprocessing.context import set_spawning_popen -if sys.version_info[:2] < (3, 3): - ProcessLookupError = OSError - -if sys.platform != "win32": - from . import resource_tracker - - -__all__ = [] - -if sys.platform != "win32": - # - # Wrapper for an fd used while launching a process - # - - class _DupFd(object): - def __init__(self, fd): - self.fd = reduction._mk_inheritable(fd) - - def detach(self): - return self.fd - - # - # Start child process using subprocess.Popen - # - - __all__.append('Popen') - - class Popen(object): - method = 'loky' - DupFd = _DupFd - - def __init__(self, process_obj): - sys.stdout.flush() - sys.stderr.flush() - self.returncode = None - self._fds = [] - self._launch(process_obj) - - if sys.version_info < (3, 4): - @classmethod - def duplicate_for_child(cls, fd): - popen = get_spawning_popen() - popen._fds.append(fd) - return reduction._mk_inheritable(fd) - - else: - def duplicate_for_child(self, fd): - self._fds.append(fd) - return reduction._mk_inheritable(fd) - - def poll(self, flag=os.WNOHANG): - if self.returncode is None: - while True: - try: - pid, sts = os.waitpid(self.pid, flag) - except OSError: - # Child process not yet created. 
See #1731717 - # e.errno == errno.ECHILD == 10 - return None - else: - break - if pid == self.pid: - if os.WIFSIGNALED(sts): - self.returncode = -os.WTERMSIG(sts) - else: - assert os.WIFEXITED(sts) - self.returncode = os.WEXITSTATUS(sts) - return self.returncode - - def wait(self, timeout=None): - if sys.version_info < (3, 3): - import time - if timeout is None: - return self.poll(0) - deadline = time.time() + timeout - delay = 0.0005 - while 1: - res = self.poll() - if res is not None: - break - remaining = deadline - time.time() - if remaining <= 0: - break - delay = min(delay * 2, remaining, 0.05) - time.sleep(delay) - return res - - if self.returncode is None: - if timeout is not None: - from multiprocessing.connection import wait - if not wait([self.sentinel], timeout): - return None - # This shouldn't block if wait() returned successfully. - return self.poll(os.WNOHANG if timeout == 0.0 else 0) - return self.returncode - - def terminate(self): - if self.returncode is None: - try: - os.kill(self.pid, signal.SIGTERM) - except ProcessLookupError: - pass - except OSError: - if self.wait(timeout=0.1) is None: - raise +from . import reduction, resource_tracker, spawn - def _launch(self, process_obj): - tracker_fd = resource_tracker._resource_tracker.getfd() +__all__ = ['Popen'] - fp = BytesIO() - set_spawning_popen(self) - try: - prep_data = spawn.get_preparation_data( - process_obj._name, - getattr(process_obj, "init_main_module", True)) - reduction.dump(prep_data, fp) - reduction.dump(process_obj, fp) - finally: - set_spawning_popen(None) +# +# Wrapper for an fd used while launching a process +# - try: - parent_r, child_w = os.pipe() - child_r, parent_w = os.pipe() - # for fd in self._fds: - # _mk_inheritable(fd) - - cmd_python = [sys.executable] - cmd_python += ['-m', self.__module__] - cmd_python += ['--process-name', str(process_obj.name)] - cmd_python += ['--pipe', - str(reduction._mk_inheritable(child_r))] - reduction._mk_inheritable(child_w) - reduction._mk_inheritable(tracker_fd) - self._fds.extend([child_r, child_w, tracker_fd]) - if sys.version_info >= (3, 8) and os.name == 'posix': - mp_tracker_fd = prep_data['mp_tracker_args']['fd'] - self.duplicate_for_child(mp_tracker_fd) - - from .fork_exec import fork_exec - pid = fork_exec(cmd_python, self._fds, env=process_obj.env) - util.debug("launched python with pid {} and cmd:\n{}" - .format(pid, cmd_python)) - self.sentinel = parent_r - - method = 'getbuffer' - if not hasattr(fp, method): - method = 'getvalue' - with os.fdopen(parent_w, 'wb') as f: - f.write(getattr(fp, method)()) - self.pid = pid - finally: - if parent_r is not None: - util.Finalize(self, os.close, (parent_r,)) - for fd in (child_r, child_w): - if fd is not None: - os.close(fd) +class _DupFd: + def __init__(self, fd): + self.fd = reduction._mk_inheritable(fd) - @staticmethod - def thread_is_spawning(): - return True + def detach(self): + return self.fd + + +# +# Start child process using subprocess.Popen +# + +class Popen: + method = 'loky' + DupFd = _DupFd + + def __init__(self, process_obj): + sys.stdout.flush() + sys.stderr.flush() + self.returncode = None + self._fds = [] + self._launch(process_obj) + + def duplicate_for_child(self, fd): + self._fds.append(fd) + return reduction._mk_inheritable(fd) + + def poll(self, flag=os.WNOHANG): + if self.returncode is None: + while True: + try: + pid, sts = os.waitpid(self.pid, flag) + except OSError: + # Child process not yet created. 
See #1731717 + # e.errno == errno.ECHILD == 10 + return None + else: + break + if pid == self.pid: + if os.WIFSIGNALED(sts): + self.returncode = -os.WTERMSIG(sts) + else: + assert os.WIFEXITED(sts) + self.returncode = os.WEXITSTATUS(sts) + return self.returncode + + def wait(self, timeout=None): + if self.returncode is None: + if timeout is not None: + if not wait([self.sentinel], timeout): + return None + # This shouldn't block if wait() returned successfully. + return self.poll(os.WNOHANG if timeout == 0.0 else 0) + return self.returncode + + def terminate(self): + if self.returncode is None: + try: + os.kill(self.pid, signal.SIGTERM) + except ProcessLookupError: + pass + except OSError: + if self.wait(timeout=0.1) is None: + raise + + def _launch(self, process_obj): + + tracker_fd = resource_tracker._resource_tracker.getfd() + + fp = BytesIO() + set_spawning_popen(self) + try: + prep_data = spawn.get_preparation_data( + process_obj._name, + getattr(process_obj, "init_main_module", True)) + reduction.dump(prep_data, fp) + reduction.dump(process_obj, fp) + + finally: + set_spawning_popen(None) + + try: + parent_r, child_w = os.pipe() + child_r, parent_w = os.pipe() + # for fd in self._fds: + # _mk_inheritable(fd) + + cmd_python = [sys.executable] + cmd_python += ['-m', self.__module__] + cmd_python += ['--process-name', str(process_obj.name)] + cmd_python += ['--pipe', str(reduction._mk_inheritable(child_r))] + reduction._mk_inheritable(child_w) + reduction._mk_inheritable(tracker_fd) + self._fds += [child_r, child_w, tracker_fd] + if sys.version_info >= (3, 8) and os.name == 'posix': + mp_tracker_fd = prep_data['mp_tracker_args']['fd'] + self.duplicate_for_child(mp_tracker_fd) + + from .fork_exec import fork_exec + pid = fork_exec(cmd_python, self._fds, env=process_obj.env) + util.debug( + f"launched python with pid {pid} and cmd:\n{cmd_python}" + ) + self.sentinel = parent_r + + method = 'getbuffer' + if not hasattr(fp, method): + method = 'getvalue' + with os.fdopen(parent_w, 'wb') as f: + f.write(getattr(fp, method)()) + self.pid = pid + finally: + if parent_r is not None: + util.Finalize(self, os.close, (parent_r,)) + for fd in (child_r, child_w): + if fd is not None: + os.close(fd) + + @staticmethod + def thread_is_spawning(): + return True if __name__ == '__main__': @@ -187,8 +155,7 @@ def thread_is_spawning(): args = parser.parse_args() - info = dict() - + info = {} exitcode = 1 try: with os.fdopen(args.pipe, 'rb') as from_parent: @@ -203,7 +170,7 @@ def thread_is_spawning(): exitcode = process_obj._bootstrap() except Exception: print('\n\n' + '-' * 80) - print('{} failed with traceback: '.format(args.process_name)) + print(f'{args.process_name} failed with traceback: ') print('-' * 80) import traceback print(traceback.format_exc()) diff --git a/joblib/externals/loky/backend/popen_loky_win32.py b/joblib/externals/loky/backend/popen_loky_win32.py index 523bd078c..e2702a724 100644 --- a/joblib/externals/loky/backend/popen_loky_win32.py +++ b/joblib/externals/loky/backend/popen_loky_win32.py @@ -1,24 +1,15 @@ import os import sys +import msvcrt +import _winapi from pickle import load from multiprocessing import process, util +from multiprocessing.context import get_spawning_popen, set_spawning_popen +from multiprocessing.popen_spawn_win32 import Popen as _Popen +from multiprocessing.reduction import duplicate -from . import spawn -from . import reduction -from .context import get_spawning_popen, set_spawning_popen +from . 
import reduction, spawn -if sys.platform == "win32": - # Avoid import error by code introspection tools such as test runners - # trying to import this module while running on non-Windows systems. - import msvcrt - from .compat_win32 import _winapi - from .compat_win32 import Popen as _Popen - from .reduction import duplicate -else: - _Popen = object - -if sys.version_info[:2] < (3, 3): - from os import fdopen as open __all__ = ['Popen'] @@ -26,10 +17,6 @@ # # -TERMINATE = 0x10000 -WINEXE = (sys.platform == 'win32' and getattr(sys, 'frozen', False)) -WINSERVICE = sys.executable.lower().endswith("pythonservice.exe") - def _path_eq(p1, p2): return p1 == p2 or os.path.normcase(p1) == os.path.normcase(p2) @@ -61,13 +48,12 @@ def __init__(self, process_obj): os.close(rfd) cmd = get_command_line(parent_pid=os.getpid(), pipe_handle=rhandle) - cmd = ' '.join('"%s"' % x for x in cmd) + cmd = ' '.join(f'"{x}"' for x in cmd) python_exe = spawn.get_executable() # copy the environment variables to set in the child process - child_env = os.environ.copy() - child_env.update(process_obj.env) + child_env = {**os.environ, **process_obj.env} # bpo-35797: When running in a venv, we bypass the redirect # executor and launch our base Python. @@ -87,7 +73,7 @@ def __init__(self, process_obj): # the cleaner multiprocessing.reduction.steal_handle should # be used instead. inherit = True - hp, ht, pid, tid = _winapi.CreateProcess( + hp, ht, pid, _ = _winapi.CreateProcess( python_exe, cmd, None, None, inherit, 0, child_env, None, None) @@ -105,22 +91,19 @@ def __init__(self, process_obj): # send information to child set_spawning_popen(self) - if sys.version_info[:2] < (3, 4): - Popen._tls.process_handle = int(hp) try: reduction.dump(prep_data, to_child) reduction.dump(process_obj, to_child) finally: set_spawning_popen(None) - if sys.version_info[:2] < (3, 4): - del Popen._tls.process_handle except IOError as exc: # IOError 22 happens when the launched subprocess terminated before # wfd.close is called. Thus we can safely ignore it. 
if exc.errno != 22: raise - util.debug("While starting {}, ignored a IOError 22" - .format(process_obj._name)) + util.debug( + f"While starting {process_obj._name}, ignored a IOError 22" + ) def duplicate_for_child(self, handle): assert self is get_spawning_popen() @@ -132,12 +115,12 @@ def get_command_line(pipe_handle, **kwds): Returns prefix of command line used for spawning a child process ''' if getattr(sys, 'frozen', False): - return ([sys.executable, '--multiprocessing-fork', pipe_handle]) + return [sys.executable, '--multiprocessing-fork', pipe_handle] else: prog = 'from joblib.externals.loky.backend.popen_loky_win32 import main; main()' opts = util._args_from_interpreter_flags() - return [spawn.get_executable()] + opts + [ - '-c', prog, '--multiprocessing-fork', pipe_handle] + return [spawn.get_executable(), *opts, + '-c', prog, '--multiprocessing-fork', pipe_handle] def is_forking(argv): @@ -170,4 +153,4 @@ def main(): from_parent.close() exitcode = self._bootstrap() - exit(exitcode) + sys.exit(exitcode) diff --git a/joblib/externals/loky/backend/process.py b/joblib/externals/loky/backend/process.py index 30a20c061..eafde66d0 100644 --- a/joblib/externals/loky/backend/process.py +++ b/joblib/externals/loky/backend/process.py @@ -4,11 +4,10 @@ # authors: Thomas Moreau and Olivier Grisel # # based on multiprocessing/process.py (17/02/2017) -# * Add some compatibility function for python2.7 and 3.3 # -import os import sys -from .compat import BaseProcess +from multiprocessing.context import assert_spawning +from multiprocessing.process import BaseProcess class LokyProcess(BaseProcess): @@ -17,15 +16,9 @@ class LokyProcess(BaseProcess): def __init__(self, group=None, target=None, name=None, args=(), kwargs={}, daemon=None, init_main_module=False, env=None): - if sys.version_info < (3, 3): - super(LokyProcess, self).__init__( - group=group, target=target, name=name, args=args, - kwargs=kwargs) - self.daemon = daemon - else: - super(LokyProcess, self).__init__( - group=group, target=target, name=name, args=args, - kwargs=kwargs, daemon=daemon) + super().__init__( + group=group, target=target, name=name, args=args, + kwargs=kwargs, daemon=daemon) self.env = {} if env is None else env self.authkey = self.authkey self.init_main_module = init_main_module @@ -38,55 +31,13 @@ def _Popen(process_obj): from .popen_loky_posix import Popen return Popen(process_obj) - if sys.version_info < (3, 3): - def start(self): - ''' - Start child process - ''' - from multiprocessing.process import _current_process, _cleanup - assert self._popen is None, 'cannot start a process twice' - assert self._parent_pid == os.getpid(), \ - 'can only start a process object created by current process' - _cleanup() - self._popen = self._Popen(self) - self._sentinel = self._popen.sentinel - _current_process._children.add(self) - - @property - def sentinel(self): - ''' - Return a file descriptor (Unix) or handle (Windows) suitable for - waiting for process termination. 
- ''' - try: - return self._sentinel - except AttributeError: - raise ValueError("process not started") - - if sys.version_info < (3, 4): - @property - def authkey(self): - return self._authkey - - @authkey.setter - def authkey(self, authkey): - ''' - Set authorization key of process - ''' - self._authkey = AuthenticationKey(authkey) - - def _bootstrap(self): - from .context import set_start_method - set_start_method(self._start_method) - super(LokyProcess, self)._bootstrap() - class LokyInitMainProcess(LokyProcess): _start_method = 'loky_init_main' def __init__(self, group=None, target=None, name=None, args=(), kwargs={}, daemon=None): - super(LokyInitMainProcess, self).__init__( + super().__init__( group=group, target=target, name=name, args=args, kwargs=kwargs, daemon=daemon, init_main_module=True) @@ -97,7 +48,6 @@ def __init__(self, group=None, target=None, name=None, args=(), class AuthenticationKey(bytes): def __reduce__(self): - from .context import assert_spawning try: assert_spawning(self) except RuntimeError: diff --git a/joblib/externals/loky/backend/queues.py b/joblib/externals/loky/backend/queues.py index 62735db3a..4113b89fb 100644 --- a/joblib/externals/loky/backend/queues.py +++ b/joblib/externals/loky/backend/queues.py @@ -4,8 +4,6 @@ # authors: Thomas Moreau, Olivier Grisel # # based on multiprocessing/queues.py (16/02/2017) -# * Add some compatibility function for python2.7 and 3.3 and makes sure -# it uses the right synchronization primitive. # * Add some custom reducers for the Queues/SimpleQueue to tweak the # pickling process. (overload Queue._feed/SimpleQueue.put) # @@ -14,16 +12,16 @@ import errno import weakref import threading - from multiprocessing import util -from multiprocessing import connection -from multiprocessing.synchronize import SEM_VALUE_MAX -from multiprocessing.queues import Full -from multiprocessing.queues import _sentinel, Queue as mp_Queue -from multiprocessing.queues import SimpleQueue as mp_SimpleQueue +from multiprocessing.queues import ( + Full, + Queue as mp_Queue, + SimpleQueue as mp_SimpleQueue, + _sentinel, +) +from multiprocessing.context import assert_spawning -from .reduction import loads, dumps -from .context import assert_spawning, get_context +from .reduction import dumps __all__ = ['Queue', 'SimpleQueue', 'Full'] @@ -32,33 +30,7 @@ class Queue(mp_Queue): def __init__(self, maxsize=0, reducers=None, ctx=None): - - if sys.version_info[:2] >= (3, 4): - super().__init__(maxsize=maxsize, ctx=ctx) - else: - if maxsize <= 0: - # Can raise ImportError (see issues #3770 and #23400) - maxsize = SEM_VALUE_MAX - if ctx is None: - ctx = get_context() - self._maxsize = maxsize - self._reader, self._writer = connection.Pipe(duplex=False) - self._rlock = ctx.Lock() - self._opid = os.getpid() - if sys.platform == 'win32': - self._wlock = None - else: - self._wlock = ctx.Lock() - self._sem = ctx.BoundedSemaphore(maxsize) - - # For use by concurrent.futures - self._ignore_epipe = False - - self._after_fork() - - if sys.platform != 'win32': - util.register_after_fork(self, Queue._after_fork) - + super().__init__(maxsize=maxsize, ctx=ctx) self._reducers = reducers # Use custom queue set/get state to be able to reduce the custom reducers @@ -133,7 +105,7 @@ def _feed(buffer, notempty, send_bytes, writelock, close, reducers, else: wacquire = None - while 1: + while True: try: nacquire() try: @@ -142,7 +114,7 @@ def _feed(buffer, notempty, send_bytes, writelock, close, reducers, finally: nrelease() try: - while 1: + while True: obj = bpopleft() if obj 
is sentinel: util.debug('feeder thread got sentinel -- exiting') @@ -171,7 +143,7 @@ def _feed(buffer, notempty, send_bytes, writelock, close, reducers, # We ignore errors which happen after the process has # started to cleanup. if util.is_exiting(): - util.info('error in queue thread: %s', e) + util.info(f'error in queue thread: {e}') return else: queue_sem.release() @@ -185,29 +157,11 @@ def _on_queue_feeder_error(self, e, obj): import traceback traceback.print_exc() - if sys.version_info[:2] < (3, 4): - # Compat for python2.7/3.3 that use _send instead of _send_bytes - def _after_fork(self): - super(Queue, self)._after_fork() - self._send_bytes = self._writer.send_bytes - class SimpleQueue(mp_SimpleQueue): def __init__(self, reducers=None, ctx=None): - if sys.version_info[:2] >= (3, 4): - super().__init__(ctx=ctx) - else: - # Use the context to create the sync objects for python2.7/3.3 - if ctx is None: - ctx = get_context() - self._reader, self._writer = connection.Pipe(duplex=False) - self._rlock = ctx.Lock() - self._poll = self._reader.poll - if sys.platform == 'win32': - self._wlock = None - else: - self._wlock = ctx.Lock() + super().__init__(ctx=ctx) # Add possiblity to use custom reducers self._reducers = reducers @@ -226,15 +180,6 @@ def __setstate__(self, state): (self._reader, self._writer, self._reducers, self._rlock, self._wlock) = state - if sys.version_info[:2] < (3, 4): - # For python2.7/3.3, overload get to avoid creating deadlocks with - # unpickling errors. - def get(self): - with self._rlock: - res = self._reader.recv_bytes() - # unserialize the data after having released the lock - return loads(res) - # Overload put to use our customizable reducer def put(self, obj): # serialize the data before acquiring the lock diff --git a/joblib/externals/loky/backend/reduction.py b/joblib/externals/loky/backend/reduction.py index 4a2407c53..f1ee394bb 100644 --- a/joblib/externals/loky/backend/reduction.py +++ b/joblib/externals/loky/backend/reduction.py @@ -8,71 +8,28 @@ # * Add CustomizableLokyPickler to allow customizing pickling process # on the fly. # +import copyreg import io -import os -import sys import functools -from multiprocessing import util import types -try: - # Python 2 compat - from cPickle import loads as pickle_loads -except ImportError: - from pickle import loads as pickle_loads - import copyreg - -from pickle import HIGHEST_PROTOCOL - -if sys.platform == "win32": - if sys.version_info[:2] > (3, 3): - from multiprocessing.reduction import duplicate - else: - from multiprocessing.forking import duplicate +import sys +import os +from multiprocessing import util +from pickle import loads, HIGHEST_PROTOCOL ############################################################################### # Enable custom pickling in Loky. -# To allow instance customization of the pickling process, we use 2 classes. -# _ReducerRegistry gives module level customization and CustomizablePickler -# permits to use instance base custom reducers. Only CustomizablePickler -# should be used. - -class _ReducerRegistry(object): - """Registry for custom reducers. - HIGHEST_PROTOCOL is selected by default as this pickler is used - to pickle ephemeral datastructures for interprocess communication - hence no backward compatibility is required. +_dispatch_table = {} - """ - - # We override the pure Python pickler as its the only way to be able to - # customize the dispatch table without side effects in Python 2.6 - # to 3.2. 
For Python 3.3+ leverage the new dispatch_table - # feature from http://bugs.python.org/issue14166 that makes it possible - # to use the C implementation of the Pickler which is faster. - - dispatch_table = {} - - @classmethod - def register(cls, type, reduce_func): - """Attach a reducer function to a given type in the dispatch table.""" - if sys.version_info < (3,): - # Python 2 pickler dispatching is not explicitly customizable. - # Let us use a closure to workaround this limitation. - def dispatcher(cls, obj): - reduced = reduce_func(obj) - cls.save_reduce(obj=obj, *reduced) - cls.dispatch_table[type] = dispatcher - else: - cls.dispatch_table[type] = reduce_func +def register(type_, reduce_function): + _dispatch_table[type_] = reduce_function ############################################################################### # Registers extra pickling routines to improve picklization for loky -register = _ReducerRegistry.register - # make methods picklable def _reduce_method(m): @@ -157,15 +114,16 @@ def set_loky_pickler(loky_pickler=None): loky_pickler_cls = module_pickle.Pickler except (ImportError, AttributeError) as e: extra_info = ("\nThis error occurred while setting loky_pickler to" - " '{}', as required by the env variable LOKY_PICKLER" - " or the function set_loky_pickler." - .format(loky_pickler)) + f" '{loky_pickler}', as required by the env variable " + "LOKY_PICKLER or the function set_loky_pickler.") e.args = (e.args[0] + extra_info,) + e.args[1:] e.msg = e.args[0] raise e - util.debug("Using '{}' for serialization." - .format(loky_pickler if loky_pickler else "cloudpickle")) + util.debug( + f"Using '{loky_pickler if loky_pickler else 'cloudpickle'}' for " + "serialization." + ) class CustomizablePickler(loky_pickler_cls): _loky_pickler_cls = loky_pickler_cls @@ -195,43 +153,32 @@ def __init__(self, writer, reducers=None, protocol=HIGHEST_PROTOCOL): loky_pickler_cls.__init__(self, writer, protocol=protocol) if reducers is None: reducers = {} - if sys.version_info < (3,): - self.dispatch = loky_pickler_cls.dispatch.copy() - self.dispatch.update(_ReducerRegistry.dispatch_table) + + if hasattr(self, "dispatch_table"): + # Force a copy that we will update without mutating the + # any class level defined dispatch_table. + loky_dt = dict(self.dispatch_table) else: - if hasattr(self, "dispatch_table"): - # Force a copy that we will update without mutating the - # any class level defined dispatch_table. - loky_dt = dict(self.dispatch_table) - else: - # Use standard reducers as bases - loky_dt = copyreg.dispatch_table.copy() - - # Register loky specific reducers - loky_dt.update(_ReducerRegistry.dispatch_table) - - # Set the new dispatch table, taking care of the fact that we - # need to use the member_descriptor when we inherit from a - # subclass of the C implementation of the Pickler base class - # with an class level dispatch_table attribute. - self._set_dispatch_table(loky_dt) - - # Register custom reducers + # Use standard reducers as bases + loky_dt = copyreg.dispatch_table.copy() + + # Register loky specific reducers + loky_dt.update(_dispatch_table) + + # Set the new dispatch table, taking care of the fact that we + # need to use the member_descriptor when we inherit from a + # subclass of the C implementation of the Pickler base class + # with an class level dispatch_table attribute. 
+ self._set_dispatch_table(loky_dt) + + # Register the reducers for type, reduce_func in reducers.items(): self.register(type, reduce_func) def register(self, type, reduce_func): """Attach a reducer function to a given type in the dispatch table. """ - if sys.version_info < (3,): - # Python 2 pickler dispatching is not explicitly customizable. - # Let us use a closure to workaround this limitation. - def dispatcher(self, obj): - reduced = reduce_func(obj) - self.save_reduce(obj=obj, *reduced) - self.dispatch[type] = dispatcher - else: - self.dispatch_table[type] = reduce_func + self.dispatch_table[type] = reduce_func _LokyPickler = CustomizablePickler _loky_pickler_name = loky_pickler @@ -251,13 +198,6 @@ def get_loky_pickler(): set_loky_pickler() -def loads(buf): - # Compat for python2.7 version - if sys.version_info < (3, 3) and isinstance(buf, io.BytesIO): - buf = buf.getvalue() - return pickle_loads(buf) - - def dump(obj, file, reducers=None, protocol=None): '''Replacement for pickle.dump() using _LokyPickler.''' global _LokyPickler @@ -269,12 +209,11 @@ def dumps(obj, reducers=None, protocol=None): buf = io.BytesIO() dump(obj, buf, reducers=reducers, protocol=protocol) - if sys.version_info < (3, 3): - return buf.getvalue() return buf.getbuffer() __all__ = ["dump", "dumps", "loads", "register", "set_loky_pickler"] if sys.platform == "win32": + from multiprocessing.reduction import duplicate __all__ += ["duplicate"] diff --git a/joblib/externals/loky/backend/resource_tracker.py b/joblib/externals/loky/backend/resource_tracker.py index 95dff35d0..d84504e14 100644 --- a/joblib/externals/loky/backend/resource_tracker.py +++ b/joblib/externals/loky/backend/resource_tracker.py @@ -6,9 +6,13 @@ # # adapted from multiprocessing/semaphore_tracker.py (17/02/2017) # * include custom spawnv_passfds to start the process -# * use custom unlink from our own SemLock implementation # * add some VERBOSE logging # +# TODO: multiprocessing.resource_tracker was contributed to Python 3.8 so +# once loky drops support for Python 3.7 it might be possible to stop +# maintaining this loky-specific fork. As a consequence, it might also be +# possible to stop maintaining the loky.backend.synchronize fork of +# multiprocessing.synchronize. # # On Unix we run a server process which keeps track of unlinked @@ -45,23 +49,16 @@ import signal import warnings import threading +from _multiprocessing import sem_unlink +from multiprocessing import util from . import spawn -from multiprocessing import util if sys.platform == "win32": - from .compat_win32 import _winapi - from .reduction import duplicate + import _winapi import msvcrt + from multiprocessing.reduction import duplicate -try: - from _multiprocessing import sem_unlink -except ImportError: - from .semlock import sem_unlink - -if sys.version_info < (3,): - BrokenPipeError = OSError - from os import fdopen as open __all__ = ['ensure_running', 'register', 'unregister'] @@ -80,7 +77,7 @@ VERBOSE = False -class ResourceTracker(object): +class ResourceTracker: def __init__(self): self._lock = threading.Lock() @@ -133,22 +130,13 @@ def ensure_running(self): os.close(r) r = _r - cmd = 'from {} import main; main({}, {})'.format( - main.__module__, r, VERBOSE) + cmd = f'from {main.__module__} import main; main({r}, {VERBOSE})' try: fds_to_pass.append(r) # process will out live us, so no need to wait on pid exe = spawn.get_executable() - args = [exe] + util._args_from_interpreter_flags() - # In python 3.3, there is a bug which put `-RRRRR..` instead of - # `-R` in args. 
Replace it to get the correct flags. - # See https://github.com/python/cpython/blob/3.3/Lib/subprocess.py#L488 - if sys.version_info[:2] <= (3, 3): - import re - for i in range(1, len(args)): - args[i] = re.sub("-R+", "-R", args[i]) - args += ['-c', cmd] - util.debug("launching resource tracker: {}".format(args)) + args = [exe, *util._args_from_interpreter_flags(), '-c', cmd] + util.debug(f"launching resource tracker: {args}") # bpo-33613: Register a signal mask that will block the # signals. This signal mask will be inherited by the child # that is going to be spawned and will protect the child from a @@ -201,11 +189,11 @@ def maybe_unlink(self, name, rtype): self._send("MAYBE_UNLINK", name, rtype) def _send(self, cmd, name, rtype): - msg = '{0}:{1}:{2}\n'.format(cmd, name, rtype).encode('ascii') if len(name) > 512: # posix guarantees that writes to a pipe of less than PIPE_BUF # bytes are atomic, and that PIPE_BUF >= 512 raise ValueError('name too long') + msg = f'{cmd}:{name}:{rtype}\n'.encode('ascii') nbytes = os.write(self._fd, msg) assert nbytes == len(msg) @@ -239,7 +227,7 @@ def main(fd, verbose=0): if verbose: util.debug("Main resource tracker is running") - registry = {rtype: dict() for rtype in _CLEANUP_FUNCS.keys()} + registry = {rtype: {} for rtype in _CLEANUP_FUNCS.keys()} try: # keep track of registered/unregistered resources if sys.platform == "win32": @@ -261,10 +249,11 @@ def main(fd, verbose=0): if rtype not in _CLEANUP_FUNCS: raise ValueError( - 'Cannot register {} for automatic cleanup: ' - 'unknown resource type ({}). Resource type should ' - 'be one of the following: {}'.format( - name, rtype, list(_CLEANUP_FUNCS.keys()))) + f'Cannot register {name} for automatic cleanup: ' + f'unknown resource type ({rtype}). Resource type ' + 'should be one of the following: ' + f'{list(_CLEANUP_FUNCS.keys())}' + ) if cmd == 'REGISTER': if name not in registry[rtype]: @@ -274,37 +263,40 @@ def main(fd, verbose=0): if verbose: util.debug( - "[ResourceTracker] incremented refcount of {} " - "{} (current {})".format( - rtype, name, registry[rtype][name])) + "[ResourceTracker] incremented refcount of " + f"{rtype} {name} " + f"(current {registry[rtype][name]})" + ) elif cmd == 'UNREGISTER': del registry[rtype][name] if verbose: util.debug( - "[ResourceTracker] unregister {} {}: " - "registry({})".format(name, rtype, len(registry))) + f"[ResourceTracker] unregister {name} {rtype}: " + f"registry({len(registry)})" + ) elif cmd == 'MAYBE_UNLINK': registry[rtype][name] -= 1 if verbose: util.debug( - "[ResourceTracker] decremented refcount of {} " - "{} (current {})".format( - rtype, name, registry[rtype][name])) + "[ResourceTracker] decremented refcount of " + f"{rtype} {name} " + f"(current {registry[rtype][name]})" + ) if registry[rtype][name] == 0: del registry[rtype][name] try: if verbose: util.debug( - "[ResourceTracker] unlink {}" - .format(name)) + f"[ResourceTracker] unlink {name}" + ) _CLEANUP_FUNCS[rtype](name) except Exception as e: warnings.warn( - 'resource_tracker: %s: %r' % (name, e)) + f'resource_tracker: {name}: {e!r}') else: - raise RuntimeError('unrecognized command %r' % cmd) + raise RuntimeError(f'unrecognized command {cmd!r}') except BaseException: try: sys.excepthook(*sys.exc_info()) @@ -315,9 +307,11 @@ def main(fd, verbose=0): def _unlink_resources(rtype_registry, rtype): if rtype_registry: try: - warnings.warn('resource_tracker: There appear to be %d ' - 'leaked %s objects to clean up at shutdown' % - (len(rtype_registry), rtype)) + warnings.warn( + 
'resource_tracker: There appear to be ' + f'{len(rtype_registry)} leaked {rtype} objects to ' + 'clean up at shutdown' + ) except Exception: pass for name in rtype_registry: @@ -327,10 +321,9 @@ def _unlink_resources(rtype_registry, rtype): try: _CLEANUP_FUNCS[rtype](name) if verbose: - util.debug("[ResourceTracker] unlink {}" - .format(name)) + util.debug(f"[ResourceTracker] unlink {name}") except Exception as e: - warnings.warn('resource_tracker: %s: %r' % (name, e)) + warnings.warn(f'resource_tracker: {name}: {e!r}') for rtype, rtype_registry in registry.items(): if rtype == "folder": @@ -361,18 +354,16 @@ def spawnv_passfds(path, args, passfds): errpipe_read, errpipe_write = os.pipe() try: from .reduction import _mk_inheritable - _pass = [] - for fd in passfds: - _pass += [_mk_inheritable(fd)] from .fork_exec import fork_exec + _pass = [_mk_inheritable(fd) for fd in passfds] return fork_exec(args, _pass) finally: os.close(errpipe_read) os.close(errpipe_write) else: - cmd = ' '.join('"%s"' % x for x in args) + cmd = ' '.join(f'"{x}"' for x in args) try: - hp, ht, pid, tid = _winapi.CreateProcess( + _, ht, pid, _ = _winapi.CreateProcess( path, cmd, None, None, True, 0, None, None, None) _winapi.CloseHandle(ht) except BaseException: diff --git a/joblib/externals/loky/backend/semlock.py b/joblib/externals/loky/backend/semlock.py deleted file mode 100644 index 2d35f6a27..000000000 --- a/joblib/externals/loky/backend/semlock.py +++ /dev/null @@ -1,274 +0,0 @@ -############################################################################### -# Ctypes implementation for posix semaphore. -# -# author: Thomas Moreau and Olivier Grisel -# -# adapted from cpython/Modules/_multiprocessing/semaphore.c (17/02/2017) -# * use ctypes to access pthread semaphores and provide a full python -# semaphore management. -# * For OSX, as no sem_getvalue is not implemented, Semaphore with value > 1 -# are not guaranteed to work. -# * Only work with LokyProcess on posix -# -import os -import sys -import time -import errno -import ctypes -import tempfile -import threading -from ctypes.util import find_library - -# As we need to use ctypes return types for semlock object, failure value -# needs to be cast to proper python value. 
Unix failure convention is to -# return 0, whereas OSX returns -1 -SEM_FAILURE = ctypes.c_void_p(0).value -if sys.platform == 'darwin': - SEM_FAILURE = ctypes.c_void_p(-1).value - -# Semaphore types -RECURSIVE_MUTEX = 0 -SEMAPHORE = 1 - -# Semaphore constants -SEM_OFLAG = ctypes.c_int(os.O_CREAT | os.O_EXCL) -SEM_PERM = ctypes.c_int(384) - - -class timespec(ctypes.Structure): - _fields_ = [("tv_sec", ctypes.c_long), ("tv_nsec", ctypes.c_long)] - - -if sys.platform != 'win32': - pthread = ctypes.CDLL(find_library('pthread'), use_errno=True) - pthread.sem_open.restype = ctypes.c_void_p - pthread.sem_close.argtypes = [ctypes.c_void_p] - pthread.sem_wait.argtypes = [ctypes.c_void_p] - pthread.sem_trywait.argtypes = [ctypes.c_void_p] - pthread.sem_post.argtypes = [ctypes.c_void_p] - pthread.sem_getvalue.argtypes = [ctypes.c_void_p, ctypes.c_void_p] - pthread.sem_unlink.argtypes = [ctypes.c_char_p] - if sys.platform != "darwin": - pthread.sem_timedwait.argtypes = [ctypes.c_void_p, - ctypes.POINTER(timespec)] - -try: - from threading import get_ident -except ImportError: - def get_ident(): - return threading.current_thread().ident - - -if sys.version_info[:2] < (3, 3): - class FileExistsError(OSError): - pass - - class FileNotFoundError(OSError): - pass - - -def sem_unlink(name): - if pthread.sem_unlink(name.encode('ascii')) < 0: - raiseFromErrno() - - -def _sem_open(name, value=None): - """ Construct or retrieve a semaphore with the given name - - If value is None, try to retrieve an existing named semaphore. - Else create a new semaphore with the given value - """ - if value is None: - handle = pthread.sem_open(ctypes.c_char_p(name), 0) - else: - handle = pthread.sem_open(ctypes.c_char_p(name), SEM_OFLAG, SEM_PERM, - ctypes.c_int(value)) - - if handle == SEM_FAILURE: - e = ctypes.get_errno() - if e == errno.EEXIST: - raise FileExistsError("a semaphore named %s already exists" % name) - elif e == errno.ENOENT: - raise FileNotFoundError('cannot find semaphore named %s' % name) - elif e == errno.ENOSYS: - raise NotImplementedError('No semaphore implementation on this ' - 'system') - else: - raiseFromErrno() - - return handle - - -def _sem_timedwait(handle, timeout): - t_start = time.time() - if sys.platform != "darwin": - sec = int(timeout) - tv_sec = int(t_start) - nsec = int(1e9 * (timeout - sec) + .5) - tv_nsec = int(1e9 * (t_start - tv_sec) + .5) - deadline = timespec(sec+tv_sec, nsec+tv_nsec) - deadline.tv_sec += int(deadline.tv_nsec / 1000000000) - deadline.tv_nsec %= 1000000000 - return pthread.sem_timedwait(handle, ctypes.pointer(deadline)) - - # PERFORMANCE WARNING - # No sem_timedwait on OSX so we implement our own method. 
This method can - # degrade performances has the wait can have a latency up to 20 msecs - deadline = t_start + timeout - delay = 0 - now = time.time() - while True: - # Poll the sem file - res = pthread.sem_trywait(handle) - if res == 0: - return 0 - else: - e = ctypes.get_errno() - if e != errno.EAGAIN: - raiseFromErrno() - - # check for timeout - now = time.time() - if now > deadline: - ctypes.set_errno(errno.ETIMEDOUT) - return -1 - - # calculate how much time left and check the delay is not too long - # -- maximum is 20 msecs - difference = (deadline - now) - delay = min(delay, 20e-3, difference) - - # Sleep and increase delay - time.sleep(delay) - delay += 1e-3 - - -class SemLock(object): - """ctypes wrapper to the unix semaphore""" - - _rand = tempfile._RandomNameSequence() - - def __init__(self, kind, value, maxvalue, name=None, unlink_now=False): - self.count = 0 - self.ident = 0 - self.kind = kind - self.maxvalue = maxvalue - self.name = name - self.handle = _sem_open(self.name.encode('ascii'), value) - - def __del__(self): - try: - res = pthread.sem_close(self.handle) - assert res == 0, "Issue while closing semaphores" - except AttributeError: - pass - - def _is_mine(self): - return self.count > 0 and get_ident() == self.ident - - def acquire(self, block=True, timeout=None): - if self.kind == RECURSIVE_MUTEX and self._is_mine(): - self.count += 1 - return True - - if block and timeout is None: - res = pthread.sem_wait(self.handle) - elif not block or timeout <= 0: - res = pthread.sem_trywait(self.handle) - else: - res = _sem_timedwait(self.handle, timeout) - if res < 0: - e = ctypes.get_errno() - if e == errno.EINTR: - return None - elif e in [errno.EAGAIN, errno.ETIMEDOUT]: - return False - raiseFromErrno() - self.count += 1 - self.ident = get_ident() - return True - - def release(self): - if self.kind == RECURSIVE_MUTEX: - assert self._is_mine(), ( - "attempt to release recursive lock not owned by thread") - if self.count > 1: - self.count -= 1 - return - assert self.count == 1 - else: - if sys.platform == 'darwin': - # Handle broken get_value for mac ==> only Lock will work - # as sem_get_value do not work properly - if self.maxvalue == 1: - if pthread.sem_trywait(self.handle) < 0: - e = ctypes.get_errno() - if e != errno.EAGAIN: - raise OSError(e, errno.errorcode[e]) - else: - if pthread.sem_post(self.handle) < 0: - raiseFromErrno() - else: - raise ValueError( - "semaphore or lock released too many times") - else: - import warnings - warnings.warn("semaphore are broken on OSX, release might " - "increase its maximal value", RuntimeWarning) - else: - value = self._get_value() - if value >= self.maxvalue: - raise ValueError( - "semaphore or lock released too many times") - - if pthread.sem_post(self.handle) < 0: - raiseFromErrno() - - self.count -= 1 - - def _get_value(self): - value = ctypes.pointer(ctypes.c_int(-1)) - if pthread.sem_getvalue(self.handle, value) < 0: - raiseFromErrno() - return value.contents.value - - def _count(self): - return self.count - - def _is_zero(self): - if sys.platform == 'darwin': - # Handle broken get_value for mac ==> only Lock will work - # as sem_get_value do not work properly - if pthread.sem_trywait(self.handle) < 0: - e = ctypes.get_errno() - if e == errno.EAGAIN: - return True - raise OSError(e, errno.errorcode[e]) - else: - if pthread.sem_post(self.handle) < 0: - raiseFromErrno() - return False - else: - value = ctypes.pointer(ctypes.c_int(-1)) - if pthread.sem_getvalue(self.handle, value) < 0: - raiseFromErrno() - return 
value.contents.value == 0 - - def _after_fork(self): - self.count = 0 - - @staticmethod - def _rebuild(handle, kind, maxvalue, name): - self = SemLock.__new__(SemLock) - self.count = 0 - self.ident = 0 - self.kind = kind - self.maxvalue = maxvalue - self.name = name - self.handle = _sem_open(name.encode('ascii')) - return self - - -def raiseFromErrno(): - e = ctypes.get_errno() - raise OSError(e, errno.errorcode[e]) diff --git a/joblib/externals/loky/backend/spawn.py b/joblib/externals/loky/backend/spawn.py index 2a16c844b..3a9cc2dd1 100644 --- a/joblib/externals/loky/backend/spawn.py +++ b/joblib/externals/loky/backend/spawn.py @@ -18,7 +18,7 @@ WINSERVICE = False else: import msvcrt - from .reduction import duplicate + from multiprocessing.reduction import duplicate WINEXE = (sys.platform == 'win32' and getattr(sys, 'frozen', False)) WINSERVICE = sys.executable.lower().endswith("pythonservice.exe") @@ -65,19 +65,12 @@ def get_preparation_data(name, init_main_module=True): ) # Send sys_path and make sure the current directory will not be changed - sys_path = [p for p in sys.path] - try: - i = sys_path.index('') - except ValueError: - pass - else: - sys_path[i] = process.ORIGINAL_DIR - d['sys_path'] = sys_path + d['sys_path'] = [p if p != '' else process.ORIGINAL_DIR for p in sys.path] # Make sure to pass the information if the multiprocessing logger is active if util._logger is not None: d['log_level'] = util._logger.getEffectiveLevel() - if len(util._logger.handlers) > 0: + if util._logger.handlers: h = util._logger.handlers[0] d['log_fmt'] = h.formatter._fmt @@ -129,8 +122,6 @@ def get_preparation_data(name, init_main_module=True): process.ORIGINAL_DIR is not None): main_path = os.path.join(process.ORIGINAL_DIR, main_path) d['init_main_from_path'] = os.path.normpath(main_path) - # Compat for python2.7 - d['main_path'] = d['init_main_from_path'] return d @@ -249,10 +240,3 @@ def _fixup_main_from_path(main_path): run_name="__mp_main__") main_module.__dict__.update(main_content) sys.modules['__main__'] = sys.modules['__mp_main__'] = main_module - - -def import_main_path(main_path): - ''' - Set sys.modules['__main__'] to module at main_path - ''' - _fixup_main_from_path(main_path) diff --git a/joblib/externals/loky/backend/synchronize.py b/joblib/externals/loky/backend/synchronize.py index 592de3c02..a9518a880 100644 --- a/joblib/externals/loky/backend/synchronize.py +++ b/joblib/externals/loky/backend/synchronize.py @@ -5,10 +5,12 @@ # # adapted from multiprocessing/synchronize.py (17/02/2017) # * Remove ctx argument for compatibility reason -# * Implementation of Condition/Event are necessary for compatibility -# with python2.7/3.3, Barrier should be reimplemented to for those -# version (but it is not used in loky). +# * Registers a cleanup function with the loky resource_tracker to remove the +# semaphore when the process dies instead. # +# TODO: investigate which Python version is required to be able to use +# multiprocessing.resource_tracker and therefore multiprocessing.synchronize +# instead of a loky-specific fork. import os import sys @@ -16,11 +18,10 @@ import threading import _multiprocessing from time import time as _time +from multiprocessing import process, util +from multiprocessing.context import assert_spawning -from .context import assert_spawning from . 
import resource_tracker -from multiprocessing import process -from multiprocessing import util __all__ = [ 'Lock', 'RLock', 'Semaphore', 'BoundedSemaphore', 'Condition', 'Event' @@ -29,26 +30,19 @@ # raise ImportError for platforms lacking a working sem_open implementation. # See issue 3770 try: - if sys.version_info < (3, 4): - from .semlock import SemLock as _SemLock - from .semlock import sem_unlink - else: - from _multiprocessing import SemLock as _SemLock - from _multiprocessing import sem_unlink -except (ImportError): + from _multiprocessing import SemLock as _SemLock + from _multiprocessing import sem_unlink +except ImportError: raise ImportError("This platform lacks a functioning sem_open" + " implementation, therefore, the required" + " synchronization primitives needed will not" + " function, see issue 3770.") -if sys.version_info[:2] < (3, 3): - FileExistsError = OSError - # # Constants # -RECURSIVE_MUTEX, SEMAPHORE = list(range(2)) +RECURSIVE_MUTEX, SEMAPHORE = range(2) SEM_VALUE_MAX = _multiprocessing.SemLock.SEM_VALUE_MAX @@ -56,27 +50,35 @@ # Base class for semaphores and mutexes; wraps `_multiprocessing.SemLock` # -class SemLock(object): +class SemLock: _rand = tempfile._RandomNameSequence() - def __init__(self, kind, value, maxvalue): + def __init__(self, kind, value, maxvalue, name=None): # unlink_now is only used on win32 or when we are using fork. unlink_now = False - for i in range(100): - try: - self._semlock = _SemLock( - kind, value, maxvalue, SemLock._make_name(), - unlink_now) - except FileExistsError: # pragma: no cover - pass - else: - break - else: # pragma: no cover - raise FileExistsError('cannot find name for semaphore') - - util.debug('created semlock with handle %s and name "%s"' - % (self._semlock.handle, self._semlock.name)) + if name is None: + # Try to find an unused name for the SemLock instance. + for _ in range(100): + try: + self._semlock = _SemLock( + kind, value, maxvalue, SemLock._make_name(), unlink_now + ) + except FileExistsError: # pragma: no cover + pass + else: + break + else: # pragma: no cover + raise FileExistsError('cannot find name for semaphore') + else: + self._semlock = _SemLock( + kind, value, maxvalue, name, unlink_now + ) + self.name = name + util.debug( + f'created semlock with handle {self._semlock.handle} and name ' + f'"{self.name}"' + ) self._make_methods() @@ -93,8 +95,14 @@ def _after_fork(obj): @staticmethod def _cleanup(name): - sem_unlink(name) - resource_tracker.unregister(name, "semlock") + try: + sem_unlink(name) + except FileNotFoundError: + # Already unlinked, possibly by user code: ignore and make sure to + # unregister the semaphore from the resource tracker. 
+ pass + finally: + resource_tracker.unregister(name, "semlock") def _make_methods(self): self.acquire = self._semlock.acquire @@ -114,14 +122,15 @@ def __getstate__(self): def __setstate__(self, state): self._semlock = _SemLock._rebuild(*state) - util.debug('recreated blocker with handle %r and name "%s"' - % (state[0], state[3])) + util.debug( + f'recreated blocker with handle {state[0]!r} and name "{state[3]}"' + ) self._make_methods() @staticmethod def _make_name(): # OSX does not support long names for semaphores - return '/loky-%i-%s' % (os.getpid(), next(SemLock._rand)) + return f'/loky-{os.getpid()}-{next(SemLock._rand)}' # @@ -143,7 +152,7 @@ def __repr__(self): value = self._semlock._get_value() except Exception: value = 'unknown' - return '<%s(value=%s)>' % (self.__class__.__name__, value) + return f'<{self.__class__.__name__}(value={value})>' # @@ -160,8 +169,10 @@ def __repr__(self): value = self._semlock._get_value() except Exception: value = 'unknown' - return '<%s(value=%s, maxvalue=%s)>' % \ - (self.__class__.__name__, value, self._semlock.maxvalue) + return ( + f'<{self.__class__.__name__}(value={value}, ' + f'maxvalue={self._semlock.maxvalue})>' + ) # @@ -171,14 +182,14 @@ def __repr__(self): class Lock(SemLock): def __init__(self): - super(Lock, self).__init__(SEMAPHORE, 1, 1) + super().__init__(SEMAPHORE, 1, 1) def __repr__(self): try: if self._semlock._is_mine(): name = process.current_process().name if threading.current_thread().name != 'MainThread': - name += '|' + threading.current_thread().name + name = f'{name}|{threading.current_thread().name}' elif self._semlock._get_value() == 1: name = 'None' elif self._semlock._count() > 0: @@ -187,7 +198,7 @@ def __repr__(self): name = 'SomeOtherProcess' except Exception: name = 'unknown' - return '<%s(owner=%s)>' % (self.__class__.__name__, name) + return f'<{self.__class__.__name__}(owner={name})>' # @@ -197,14 +208,14 @@ def __repr__(self): class RLock(SemLock): def __init__(self): - super(RLock, self).__init__(RECURSIVE_MUTEX, 1, 1) + super().__init__(RECURSIVE_MUTEX, 1, 1) def __repr__(self): try: if self._semlock._is_mine(): name = process.current_process().name if threading.current_thread().name != 'MainThread': - name += '|' + threading.current_thread().name + name = f'{name}|{threading.current_thread().name}' count = self._semlock._count() elif self._semlock._get_value() == 1: name, count = 'None', 0 @@ -214,14 +225,14 @@ def __repr__(self): name, count = 'SomeOtherProcess', 'nonzero' except Exception: name, count = 'unknown', 'unknown' - return '<%s(%s, %s)>' % (self.__class__.__name__, name, count) + return f'<{self.__class__.__name__}({name}, {count})>' # # Condition variable # -class Condition(object): +class Condition: def __init__(self, lock=None): self._lock = lock or RLock() @@ -256,8 +267,7 @@ def __repr__(self): self._woken_count._semlock._get_value()) except Exception: num_waiters = 'unknown' - return '<%s(%s, %s)>' % (self.__class__.__name__, - self._lock, num_waiters) + return f'<{self.__class__.__name__}({self._lock}, {num_waiters})>' def wait(self, timeout=None): assert self._lock._semlock._is_mine(), \ @@ -268,7 +278,7 @@ def wait(self, timeout=None): # release lock count = self._lock._semlock._count() - for i in range(count): + for _ in range(count): self._lock.release() try: @@ -279,7 +289,7 @@ def wait(self, timeout=None): self._woken_count.release() # reacquire lock - for i in range(count): + for _ in range(count): self._lock.acquire() def notify(self): @@ -315,7 +325,7 @@ def notify_all(self): 
sleepers += 1 if sleepers: - for i in range(sleepers): + for _ in range(sleepers): self._woken_count.acquire() # wait for a sleeper to wake # rezero wait_semaphore in case some timeouts just happened @@ -345,7 +355,7 @@ def wait_for(self, predicate, timeout=None): # Event # -class Event(object): +class Event: def __init__(self): self._cond = Condition(Lock()) diff --git a/joblib/externals/loky/backend/utils.py b/joblib/externals/loky/backend/utils.py index dc1b82af2..2956614e4 100644 --- a/joblib/externals/loky/backend/utils.py +++ b/joblib/externals/loky/backend/utils.py @@ -4,116 +4,125 @@ import errno import signal import warnings -import threading import subprocess +import traceback try: import psutil except ImportError: psutil = None -WIN32 = sys.platform == "win32" - - -def _flag_current_thread_clean_exit(): - """Put a ``_clean_exit`` flag on the current thread""" - thread = threading.current_thread() - thread._clean_exit = True +def kill_process_tree(process, use_psutil=True): + """Terminate process and its descendants with SIGKILL""" + if use_psutil and psutil is not None: + _kill_process_tree_with_psutil(process) + else: + _kill_process_tree_without_psutil(process) def recursive_terminate(process, use_psutil=True): - if use_psutil and psutil is not None: - _recursive_terminate_with_psutil(process) - else: - _recursive_terminate_without_psutil(process) + warnings.warn( + "recursive_terminate is deprecated in loky 3.2, use kill_process_tree " + "instead", + DeprecationWarning, + ) + kill_process_tree(process, use_psutil=use_psutil) -def _recursive_terminate_with_psutil(process, retries=5): +def _kill_process_tree_with_psutil(process): try: - children = psutil.Process(process.pid).children(recursive=True) + descendants = psutil.Process(process.pid).children(recursive=True) except psutil.NoSuchProcess: return - # Kill the children in reverse order to avoid killing the parents before - # the children in cases where there are more processes nested. - for child in children[::-1]: + # Kill the descendants in reverse order to avoid killing the parents before + # the descendants in cases where there are more processes nested. + for descendant in descendants[::-1]: try: - child.kill() + descendant.kill() except psutil.NoSuchProcess: pass - process.terminate() + try: + psutil.Process(process.pid).kill() + except psutil.NoSuchProcess: + pass process.join() -def _recursive_terminate_without_psutil(process): - """Terminate a process and its descendants. - """ +def _kill_process_tree_without_psutil(process): + """Terminate a process and its descendants.""" try: - _recursive_terminate(process.pid) - except OSError as e: - warnings.warn("Failed to kill subprocesses on this platform. Please" - "install psutil: https://github.com/giampaolo/psutil") - # In case we cannot introspect the children, we fall back to the - # classic Process.terminate. - process.terminate() + if sys.platform == "win32": + _windows_taskkill_process_tree(process.pid) + else: + _posix_recursive_kill(process.pid) + except Exception: # pragma: no cover + details = traceback.format_exc() + warnings.warn( + "Failed to kill subprocesses on this platform. Please install " + "psutil: https://github.com/giampaolo/psutil\n" + f"Details:\n{details}" + ) + # In case we cannot introspect or kill the descendants, we fall back to + # only killing the main process. + # + # Note: on Windows, process.kill() is an alias for process.terminate() + # which in turn calls the Win32 API function TerminateProcess().
+ process.kill() process.join() -def _recursive_terminate(pid): - """Recursively kill the descendants of a process before killing it. - """ +def _windows_taskkill_process_tree(pid): + # On windows, the taskkill function with option `/T` terminate a given + # process pid and its children. + try: + subprocess.check_output( + ["taskkill", "/F", "/T", "/PID", str(pid)], stderr=None + ) + except subprocess.CalledProcessError as e: + # In Windows, taskkill returns 128, 255 for no process found. + if e.returncode not in [128, 255]: + # Let's raise to let the caller log the error details in a + # warning and only kill the root process. + raise # pragma: no cover + + +def _kill(pid): + # Not all systems (e.g. Windows) have a SIGKILL, but the C specification + # mandates a SIGTERM signal. While Windows is handled specifically above, + # let's try to be safe for other hypothetic platforms that only have + # SIGTERM without SIGKILL. + kill_signal = getattr(signal, 'SIGKILL', signal.SIGTERM) + try: + os.kill(pid, kill_signal) + except OSError as e: + # if OSError is raised with [Errno 3] no such process, the process + # is already terminated, else, raise the error and let the top + # level function raise a warning and retry to kill the process. + if e.errno != errno.ESRCH: + raise # pragma: no cover - if sys.platform == "win32": - # On windows, the taskkill function with option `/T` terminate a given - # process pid and its children. - try: - subprocess.check_output( - ["taskkill", "/F", "/T", "/PID", str(pid)], - stderr=None) - except subprocess.CalledProcessError as e: - # In windows, taskkill return 1 for permission denied and 128, 255 - # for no process found. - if e.returncode not in [1, 128, 255]: - raise - elif e.returncode == 1: - # Try to kill the process without its descendants if taskkill - # was denied permission. If this fails too, with an error - # different from process not found, let the top level function - # raise a warning and retry to kill the process. - try: - os.kill(pid, signal.SIGTERM) - except OSError as e: - if e.errno != errno.ESRCH: - raise - else: - try: - children_pids = subprocess.check_output( - ["pgrep", "-P", str(pid)], - stderr=None - ) - except subprocess.CalledProcessError as e: - # `ps` returns 1 when no child process has been found - if e.returncode == 1: - children_pids = b'' - else: - raise - - # Decode the result, split the cpid and remove the trailing line - children_pids = children_pids.decode().split('\n')[:-1] - for cpid in children_pids: - cpid = int(cpid) - _recursive_terminate(cpid) +def _posix_recursive_kill(pid): + """Recursively kill the descendants of a process before killing it.""" + try: + children_pids = subprocess.check_output( + ["pgrep", "-P", str(pid)], stderr=None, text=True + ) + except subprocess.CalledProcessError as e: + # `ps` returns 1 when no child process has been found + if e.returncode == 1: + children_pids = '' + else: + raise # pragma: no cover - try: - os.kill(pid, signal.SIGTERM) - except OSError as e: - # if OSError is raised with [Errno 3] no such process, the process - # is already terminated, else, raise the error and let the top - # level function raise a warning and retry to kill the process. 
- if e.errno != errno.ESRCH: - raise + # Decode the result, split the cpid and remove the trailing line + for cpid in children_pids.splitlines(): + cpid = int(cpid) + _posix_recursive_kill(cpid) + + _kill(pid) def get_exitcodes_terminated_worker(processes): @@ -129,7 +138,7 @@ def get_exitcodes_terminated_worker(processes): # the terminated worker. exitcodes = [p.exitcode for p in list(processes.values()) if p.exitcode is not None] - while len(exitcodes) == 0 and patience > 0: + while not exitcodes and patience > 0: patience -= 1 exitcodes = [p.exitcode for p in list(processes.values()) if p.exitcode is not None] @@ -140,7 +149,7 @@ def get_exitcodes_terminated_worker(processes): def _format_exitcodes(exitcodes): """Format a list of exit code with names of the signals if possible""" - str_exitcodes = ["{}({})".format(_get_exitcode_name(e), e) + str_exitcodes = [f"{_get_exitcode_name(e)}({e})" for e in exitcodes if e is not None] return "{" + ", ".join(str_exitcodes) + "}" @@ -154,14 +163,7 @@ def _get_exitcode_name(exitcode): if exitcode < 0: try: import signal - if sys.version_info > (3, 5): - return signal.Signals(-exitcode).name - - # construct an inverse lookup table - for v, k in signal.__dict__.items(): - if (v.startswith('SIG') and not v.startswith('SIG_') and - k == -exitcode): - return v + return signal.Signals(-exitcode).name except ValueError: return "UNKNOWN" elif exitcode != 255: diff --git a/joblib/externals/loky/cloudpickle_wrapper.py b/joblib/externals/loky/cloudpickle_wrapper.py index 1bf41a336..0b187e84e 100644 --- a/joblib/externals/loky/cloudpickle_wrapper.py +++ b/joblib/externals/loky/cloudpickle_wrapper.py @@ -1,17 +1,12 @@ import inspect from functools import partial +from joblib.externals.cloudpickle import dumps, loads -try: - from joblib.externals.cloudpickle import dumps, loads - cloudpickle = True -except ImportError: - cloudpickle = False +WRAP_CACHE = {} -WRAP_CACHE = dict() - -class CloudpickledObjectWrapper(object): +class CloudpickledObjectWrapper: def __init__(self, obj, keep_wrapper=False): self._obj = obj self._keep_wrapper = keep_wrapper @@ -52,9 +47,6 @@ def _reconstruct_wrapper(_pickled_object, keep_wrapper): def _wrap_objects_when_needed(obj): # Function to introspect an object and decide if it should be wrapped or # not. - if not cloudpickle: - return obj - need_wrap = "__main__" in getattr(obj, "__module__", "") if isinstance(obj, partial): return partial( @@ -92,11 +84,6 @@ def wrap_non_picklable_objects(obj, keep_wrapper=True): objects in the main scripts and to implement __reduce__ functions for complex classes. """ - if not cloudpickle: - raise ImportError("could not from joblib.externals import cloudpickle. Please install " - "cloudpickle to allow extended serialization. 
" - "(`pip install cloudpickle`).") - # If obj is a class, create a CloudpickledClassWrapper which instantiates # the object internally and wrap it directly in a CloudpickledObjectWrapper if inspect.isclass(obj): diff --git a/joblib/externals/loky/initializers.py b/joblib/externals/loky/initializers.py new file mode 100644 index 000000000..cc4b7b17c --- /dev/null +++ b/joblib/externals/loky/initializers.py @@ -0,0 +1,76 @@ +import warnings + + +def _viztracer_init(init_kwargs): + """Initialize viztracer's profiler in worker processes""" + from viztracer import VizTracer + tracer = VizTracer(**init_kwargs) + tracer.register_exit() + tracer.start() + + +def _make_viztracer_initializer_and_initargs(): + try: + import viztracer + tracer = viztracer.get_tracer() + if tracer is not None and getattr(tracer, 'enable', False): + # Profiler is active: introspect its configuration to + # initialize the workers with the same configuration. + return _viztracer_init, (tracer.init_kwargs,) + except ImportError: + # viztracer is not installed: nothing to do + pass + except Exception as e: + # In case viztracer's API evolve, we do not want to crash loky but + # we want to know about it to be able to update loky. + warnings.warn(f"Unable to introspect viztracer state: {e}") + return None, () + + +class _ChainedInitializer: + """Compound worker initializer + + This is meant to be used in conjunction with _chain_initializers to + produce the necessary chained_args list to be passed to __call__. + """ + + def __init__(self, initializers): + self._initializers = initializers + + def __call__(self, *chained_args): + for initializer, args in zip(self._initializers, chained_args): + initializer(*args) + + +def _chain_initializers(initializer_and_args): + """Convenience helper to combine a sequence of initializers. + + If some initializers are None, they are filtered out. 
+ """ + filtered_initializers = [] + filtered_initargs = [] + for initializer, initargs in initializer_and_args: + if initializer is not None: + filtered_initializers.append(initializer) + filtered_initargs.append(initargs) + + if not filtered_initializers: + return None, () + elif len(filtered_initializers) == 1: + return filtered_initializers[0], filtered_initargs[0] + else: + return _ChainedInitializer(filtered_initializers), filtered_initargs + + +def _prepare_initializer(initializer, initargs): + if initializer is not None and not callable(initializer): + raise TypeError( + f"initializer must be a callable, got: {initializer!r}" + ) + + # Introspect runtime to determine if we need to propagate the viztracer + # profiler information to the workers: + return _chain_initializers([ + (initializer, initargs), + _make_viztracer_initializer_and_initargs(), + ]) diff --git a/joblib/externals/loky/process_executor.py b/joblib/externals/loky/process_executor.py index 41e4a2b57..4e3e819ce 100644 --- a/joblib/externals/loky/process_executor.py +++ b/joblib/externals/loky/process_executor.py @@ -4,7 +4,6 @@ # author: Thomas Moreau and Olivier Grisel # # adapted from concurrent/futures/process_pool_executor.py (17/02/2017) -# * Backport for python2.7/3.3, # * Add an extra management thread to detect executor_manager_thread failures, # * Improve the shutdown process to avoid deadlocks, # * Add timeout for workers, @@ -62,36 +61,29 @@ import os import gc import sys +import queue import struct import weakref import warnings import itertools import traceback import threading -from time import time +from time import time, sleep import multiprocessing as mp from functools import partial from pickle import PicklingError +from concurrent.futures import Executor +from concurrent.futures._base import LOGGER +from concurrent.futures.process import BrokenProcessPool as _BPPException +from multiprocessing.connection import wait -from . import _base +from ._base import Future from .backend import get_context -from .backend.compat import queue -from .backend.compat import wait -from .backend.compat import set_cause from .backend.context import cpu_count from .backend.queues import Queue, SimpleQueue from .backend.reduction import set_loky_pickler, get_loky_pickler_name -from .backend.utils import recursive_terminate, get_exitcodes_terminated_worker - -try: - from concurrent.futures.process import BrokenProcessPool as _BPPException -except ImportError: - _BPPException = RuntimeError - - -# Compatibility for python2.7 -if sys.version_info[0] == 2: - ProcessLookupError = OSError +from .backend.utils import kill_process_tree, get_exitcodes_terminated_worker +from .initializers import _prepare_initializer # Mechanism to prevent infinite process spawning. When a worker of a @@ -115,7 +107,9 @@ def _get_memory_usage(pid, force_gc=False): if force_gc: gc.collect() - return Process(pid).memory_info().rss + mem_size = Process(pid).memory_info().rss + mp.util.debug(f'psutil return memory size: {mem_size}') + return mem_size except ImportError: _USE_PSUTIL = False @@ -134,12 +128,7 @@ def close(self): def wakeup(self): if not self._closed: - if sys.platform == "win32" and sys.version_info[:2] < (3, 4): - # Compat for python2.7 on windows, where poll return false for - # b"" messages. Use the slightly larger message b"0". 
- self._writer.send_bytes(b"0") - else: - self._writer.send_bytes(b"") + self._writer.send_bytes(b"") def clear(self): if not self._closed: @@ -147,7 +136,7 @@ def clear(self): self._reader.recv_bytes() -class _ExecutorFlags(object): +class _ExecutorFlags: """necessary references to maintain executor states without preventing gc It permits to keep the information needed by executor_manager_thread @@ -195,8 +184,9 @@ def _python_exit(): global _global_shutdown _global_shutdown = True items = list(_threads_wakeups.items()) - mp.util.debug("Interpreter shutting down. Waking up " - "executor_manager_thread {}".format(items)) + if len(items) > 0: + mp.util.debug("Interpreter shutting down. Waking up " + f"executor_manager_thread {items}") for _, (shutdown_lock, thread_wakeup) in items: with shutdown_lock: thread_wakeup.wakeup() @@ -224,7 +214,7 @@ class _RemoteTraceback(Exception): """Embed stringification of remote traceback in local traceback """ def __init__(self, tb=None): - self.tb = '\n"""\n{}"""'.format(tb) + self.tb = f'\n"""\n{tb}"""' def __str__(self): return self.tb @@ -246,11 +236,11 @@ def __reduce__(self): def _rebuild_exc(exc, tb): - exc = set_cause(exc, _RemoteTraceback(tb)) + exc.__cause__ = _RemoteTraceback(tb) return exc -class _WorkItem(object): +class _WorkItem: __slots__ = ["future", "fn", "args", "kwargs"] @@ -261,7 +251,7 @@ def __init__(self, future, fn, args, kwargs): self.kwargs = kwargs -class _ResultItem(object): +class _ResultItem: def __init__(self, work_id, exception=None, result=None): self.work_id = work_id @@ -269,7 +259,7 @@ def __init__(self, work_id, exception=None, result=None): self.result = result -class _CallItem(object): +class _CallItem: def __init__(self, work_id, fn, args, kwargs): self.work_id = work_id @@ -285,8 +275,9 @@ def __call__(self): return self.fn(*self.args, **self.kwargs) def __repr__(self): - return "CallItem({}, {}, {}, {})".format( - self.work_id, self.fn, self.args, self.kwargs) + return ( + f"CallItem({self.work_id}, {self.fn}, {self.args}, {self.kwargs})" + ) class _SafeQueue(Queue): @@ -296,7 +287,7 @@ def __init__(self, max_size=0, ctx=None, pending_work_items=None, self.thread_wakeup = thread_wakeup self.pending_work_items = pending_work_items self.running_work_items = running_work_items - super(_SafeQueue, self).__init__(max_size, reducers=reducers, ctx=ctx) + super().__init__(max_size, reducers=reducers, ctx=ctx) def _on_queue_feeder_error(self, e, obj): if isinstance(obj, _CallItem): @@ -310,8 +301,7 @@ def _on_queue_feeder_error(self, e, obj): "Could not pickle the task to send it to the workers.") tb = traceback.format_exception( type(e), e, getattr(e, "__traceback__", None)) - raised_error = set_cause(raised_error, - _RemoteTraceback(''.join(tb))) + raised_error.__cause__ = _RemoteTraceback(''.join(tb)) work_item = self.pending_work_items.pop(obj.work_id, None) self.running_work_items.remove(obj.work_id) # work_item can be None if another process terminated. In this @@ -322,15 +312,12 @@ def _on_queue_feeder_error(self, e, obj): del work_item self.thread_wakeup.wakeup() else: - super(_SafeQueue, self)._on_queue_feeder_error(e, obj) + super()._on_queue_feeder_error(e, obj) def _get_chunks(chunksize, *iterables): """Iterates over zip()ed iterables in chunks. 
""" - if sys.version_info < (3, 3): - it = itertools.izip(*iterables) - else: - it = zip(*iterables) + it = zip(*iterables) while True: chunk = tuple(itertools.islice(it, chunksize)) if not chunk: @@ -374,8 +361,8 @@ def _process_worker(call_queue, result_queue, initializer, initargs, to by the worker. initializer: A callable initializer, or None initargs: A tuple of args for the initializer - process_management_lock: A ctx.Lock avoiding worker timeout while some - workers are being spawned. + processes_management_lock: A ctx.Lock avoiding worker timeout while + some workers are being spawned. timeout: maximum time to wait for a new item in the call_queue. If that time is expired, the worker will shutdown. worker_exit_lock: Lock to avoid flagging the executor as broken on @@ -386,7 +373,7 @@ def _process_worker(call_queue, result_queue, initializer, initargs, try: initializer(*initargs) except BaseException: - _base.LOGGER.critical('Exception in initializer:', exc_info=True) + LOGGER.critical('Exception in initializer:', exc_info=True) # The parent will notice that the process stopped and # mark the pool broken return @@ -398,15 +385,14 @@ def _process_worker(call_queue, result_queue, initializer, initargs, _last_memory_leak_check = None pid = os.getpid() - mp.util.debug('Worker started with timeout=%s' % timeout) + mp.util.debug(f'Worker started with timeout={timeout}') while True: try: call_item = call_queue.get(block=True, timeout=timeout) if call_item is None: mp.util.info("Shutting down worker on sentinel") except queue.Empty: - mp.util.info("Shutting down worker after timeout %0.3fs" - % timeout) + mp.util.info(f"Shutting down worker after timeout {timeout:0.3f}s") if processes_management_lock.acquire(block=False): processes_management_lock.release() call_item = None @@ -421,12 +407,23 @@ def _process_worker(call_queue, result_queue, initializer, initargs, # If we cannot format correctly the exception, at least print # the traceback. print(previous_tb) + mp.util.debug('Exiting with code 1') sys.exit(1) if call_item is None: - # Notify queue management thread about clean worker shutdown + # Notify queue management thread about worker shutdown result_queue.put(pid) - with worker_exit_lock: - return + is_clean = worker_exit_lock.acquire(True, timeout=30) + + # Early notify any loky executor running in this worker process + # (nested parallelism) that this process is about to shutdown to + # avoid a deadlock waiting undifinitely for the worker to finish. 
+ _python_exit() + + if is_clean: + mp.util.debug('Exited cleanly') + else: + mp.util.info('Main process did not release worker_exit') + return try: r = call_item() except BaseException as e: @@ -467,11 +464,12 @@ def _process_worker(call_queue, result_queue, initializer, initargs, mp.util.info("Memory leak detected: shutting down worker") result_queue.put(pid) with worker_exit_lock: + mp.util.debug('Exit due to memory leak') return else: # if psutil is not installed, trigger gc.collect events # regularly to limit potential memory leaks due to reference cycles - if ((_last_memory_leak_check is None) or + if (_last_memory_leak_check is None or (time() - _last_memory_leak_check > _MEMORY_LEAK_CHECK_DELAY)): gc.collect() @@ -508,8 +506,12 @@ def __init__(self, executor): def weakref_cb(_, thread_wakeup=self.thread_wakeup, shutdown_lock=self.shutdown_lock): - mp.util.debug('Executor collected: triggering callback for' - ' QueueManager wakeup') + if mp is not None: + # At this point, the multiprocessing module can already be + # garbage collected. We only log debug info when still + # possible. + mp.util.debug('Executor collected: triggering callback for' + ' QueueManager wakeup') with shutdown_lock: thread_wakeup.wakeup() @@ -542,7 +544,7 @@ def weakref_cb(_, # of new processes or shut down self.processes_management_lock = executor._processes_management_lock - super(_ExecutorManagerThread, self).__init__() + super().__init__(name="ExecutorManagerThread") if sys.version_info < (3, 9): self.daemon = True @@ -619,7 +621,7 @@ def wait_result_broken_or_wakeup(self): "A task has failed to un-serialize. Please ensure that" " the arguments of the function are all picklable." ) - set_cause(bpe, result_item) + bpe.__cause__ = result_item else: is_broken = False except BaseException as e: @@ -630,7 +632,7 @@ def wait_result_broken_or_wakeup(self): ) tb = traceback.format_exception( type(e), e, getattr(e, "__traceback__", None)) - set_cause(bpe, _RemoteTraceback(''.join(tb))) + bpe.__cause__ = _RemoteTraceback(''.join(tb)) elif wakeup_reader in ready: # This is simply a wake-up event that might either trigger putting @@ -644,14 +646,21 @@ def wait_result_broken_or_wakeup(self): # In Windows, introspecting terminated workers exitcodes seems # unstable, therefore they are not appended in the exception # message. - exit_codes = "\nThe exit codes of the workers are {}".format( - get_exitcodes_terminated_worker(self.processes)) + exit_codes = ( + "\nThe exit codes of the workers are " + f"{get_exitcodes_terminated_worker(self.processes)}" + ) + mp.util.debug('A worker unexpectedly terminated. Workers that ' + 'might have caused the breakage: ' + + str({p.name: p.exitcode + for p in list(self.processes.values()) + if p is not None and p.sentinel in ready})) bpe = TerminatedWorkerError( "A worker process managed by the executor was unexpectedly " "terminated. This could be caused by a segmentation fault " "while calling the function or by an excessive memory usage " "causing the Operating System to kill the worker.\n" - "{}".format(exit_codes) + f"{exit_codes}" ) self.thread_wakeup.clear() @@ -669,9 +678,12 @@ def process_result_item(self, result_item): with self.processes_management_lock: p = self.processes.pop(result_item, None) - # p can be None is the executor is concurrently shutting down. + # p can be None if the executor is concurrently shutting down. 
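The exit codes embedded in the TerminatedWorkerError message above come from the exit-code helpers in utils.py; a simplified sketch of that formatting, assuming a POSIX system (Windows defines no SIGKILL):

import signal

def format_exitcode(exitcode):
    # Negative exit codes mean the worker was killed by a signal.
    if exitcode is not None and exitcode < 0:
        return f"{signal.Signals(-exitcode).name}({exitcode})"
    return f"EXIT({exitcode})"

format_exitcode(-9)   # 'SIGKILL(-9)', typical of an out-of-memory kill
format_exitcode(1)    # 'EXIT(1)'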
if p is not None: p._worker_exit_lock.release() + mp.util.debug( + f"joining {p.name} when processing {p.pid} as result_item" + ) p.join() del p @@ -690,7 +702,8 @@ def process_result_item(self, result_item): "executor. This can be caused by a too short worker " "timeout or by a memory leak.", UserWarning ) - executor._adjust_process_count() + with executor._processes_management_lock: + executor._adjust_process_count() executor = None else: # Received a _ResultItem so mark the future as completed. @@ -725,7 +738,7 @@ def terminate_broken(self, bpe): self.executor_flags.flag_as_broken(bpe) # Mark pending tasks as failed. - for work_id, work_item in self.pending_work_items.items(): + for work_item in self.pending_work_items.values(): work_item.future.set_exception(bpe) # Delete references to object. See issue16284 del work_item @@ -733,7 +746,7 @@ def terminate_broken(self, bpe): # Terminate remaining workers forcibly: the queues or their # locks may be in a dirty state and block forever. - self.kill_workers() + self.kill_workers(reason="broken executor") # clean up resources self.join_executor_internals() @@ -753,17 +766,17 @@ def flag_executor_shutting_down(self): del work_item # Kill the remaining worker forcibly to no waste time joining them - self.kill_workers() + self.kill_workers(reason="executor shutting down") - def kill_workers(self): + def kill_workers(self, reason=''): # Terminate the remaining workers using SIGKILL. This function also # terminates descendant workers of the children in case there is some # nested parallelism. while self.processes: _, p = self.processes.popitem() - mp.util.debug('terminate process {}'.format(p.name)) + mp.util.debug(f"terminate process {p.name}, reason: {reason}") try: - recursive_terminate(p) + kill_process_tree(p) except ProcessLookupError: # pragma: no cover pass @@ -777,22 +790,36 @@ def shutdown_workers(self): with self.processes_management_lock: n_children_to_stop = 0 for p in list(self.processes.values()): + mp.util.debug(f"releasing worker exit lock on {p.name}") p._worker_exit_lock.release() n_children_to_stop += 1 + mp.util.debug(f"found {n_children_to_stop} processes to stop") + # Send the right number of sentinels, to make sure all children are # properly terminated. Do it with a mechanism that avoid hanging on # Full queue when all workers have already been shutdown. n_sentinels_sent = 0 + cooldown_time = 0.001 while (n_sentinels_sent < n_children_to_stop and self.get_n_children_alive() > 0): - for i in range(n_children_to_stop - n_sentinels_sent): + for _ in range(n_children_to_stop - n_sentinels_sent): try: self.call_queue.put_nowait(None) n_sentinels_sent += 1 - except queue.Full: + except queue.Full as e: + if cooldown_time > 10.0: + raise e + mp.util.info( + "full call_queue prevented to send all sentinels at " + "once, waiting..." + ) + sleep(cooldown_time) + cooldown_time *= 2 break + mp.util.debug(f"sent {n_sentinels_sent} sentinels to the call queue") + def join_executor_internals(self): self.shutdown_workers() @@ -814,13 +841,23 @@ def join_executor_internals(self): self.thread_wakeup.close() # If .join() is not called on the created processes then - # some ctx.Queue methods may deadlock on Mac OS X. - mp.util.debug("joining processes") - for p in list(self.processes.values()): - p.join() + # some ctx.Queue methods may deadlock on macOS. 
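The shutdown path above now sleeps with an exponentially increasing cooldown when the call queue is full, instead of busy-retrying, and re-raises once the cooldown exceeds ten seconds. A simplified sketch of that pattern (illustrative, not the exact executor code):

import queue
from time import sleep

def send_sentinels(call_queue, n_to_send):
    sent, cooldown = 0, 0.001
    while sent < n_to_send:
        try:
            call_queue.put_nowait(None)
            sent += 1
        except queue.Full:
            if cooldown > 10.0:
                raise
            # Give workers some time to drain the queue, doubling each retry.
            sleep(cooldown)
            cooldown *= 2
    return sent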
+ with self.processes_management_lock: + mp.util.debug(f"joining {len(self.processes)} processes") + n_joined_processes = 0 + while True: + try: + pid, p = self.processes.popitem() + mp.util.debug(f"joining process {p.name} with pid {pid}") + p.join() + n_joined_processes += 1 + except KeyError: + break - mp.util.debug("executor management thread clean shutdown of worker " - "processes: {}".format(list(self.processes))) + mp.util.debug( + "executor management thread clean shutdown of " + f"{n_joined_processes} workers" + ) def get_n_children_alive(self): # This is an upper bound on the number of children alive. @@ -834,9 +871,8 @@ def get_n_children_alive(self): def _check_system_limits(): global _system_limits_checked, _system_limited - if _system_limits_checked: - if _system_limited: - raise NotImplementedError(_system_limited) + if _system_limits_checked and _system_limited: + raise NotImplementedError(_system_limited) _system_limits_checked = True try: nsems_max = os.sysconf("SC_SEM_NSEMS_MAX") @@ -851,8 +887,10 @@ def _check_system_limits(): # minimum number of semaphores available # according to POSIX return - _system_limited = ("system provides too few semaphores (%d available, " - "256 necessary)" % nsems_max) + _system_limited = ( + f"system provides too few semaphores ({nsems_max} available, " + "256 necessary)" + ) raise NotImplementedError(_system_limited) @@ -880,8 +918,8 @@ def _check_max_depth(context): if 0 < MAX_DEPTH and _CURRENT_DEPTH + 1 > MAX_DEPTH: raise LokyRecursionError( "Could not spawn extra nested processes at depth superior to " - "MAX_DEPTH={}. If this is intendend, you can change this limit " - "with the LOKY_MAX_DEPTH environment variable.".format(MAX_DEPTH)) + f"MAX_DEPTH={MAX_DEPTH}. If this is intendend, you can change " + "this limit with the LOKY_MAX_DEPTH environment variable.") class LokyRecursionError(RuntimeError): @@ -918,7 +956,7 @@ class ShutdownExecutorError(RuntimeError): """ -class ProcessPoolExecutor(_base.Executor): +class ProcessPoolExecutor(Executor): _at_exit = None @@ -947,8 +985,7 @@ def __init__(self, max_workers=None, job_reducers=None, initargs: A tuple of arguments to pass to the initializer. env: A dict of environment variable to overwrite in the child process. The environment variables are set before any module is - loaded. Note that this only works with the loky context and it - is unreliable under windows with Python < 3.6. + loaded. Note that this only works with the loky context. """ _check_system_limits() @@ -964,11 +1001,9 @@ def __init__(self, max_workers=None, job_reducers=None, self._context = context self._env = env - if initializer is not None and not callable(initializer): - raise TypeError("initializer must be a callable") - self._initializer = initializer - self._initargs = initargs - + self._initializer, self._initargs = _prepare_initializer( + initializer, initargs + ) _check_max_depth(self._context) if result_reducers is None: @@ -1035,17 +1070,6 @@ def _start_executor_manager_thread(self): if self._executor_manager_thread is None: mp.util.debug('_start_executor_manager_thread called') - # When the executor gets garbarge collected, the weakref callback - # will wake up the queue management thread so that it can terminate - # if there is no pending work item. 
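Earlier in this hunk, _check_system_limits probes the number of POSIX semaphores the system provides; that probe boils down to something like this hypothetical sketch:

import os

def has_enough_semaphores(minimum=256):
    try:
        nsems_max = os.sysconf("SC_SEM_NSEMS_MAX")
    except (AttributeError, ValueError):
        # sysconf or this setting is not available: nothing to check.
        return True
    # -1 means the limit is indeterminate; treat it as high enough.
    return nsems_max == -1 or nsems_max >= minimum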
- def weakref_cb( - _, thread_wakeup=self._executor_manager_thread_wakeup, - shutdown_lock=self._shutdown_lock): - mp.util.debug('Executor collected: triggering callback for' - ' QueueManager wakeup') - with self._shutdown_lock: - thread_wakeup.wakeup() - # Start the processes so that their sentinels are known. self._executor_manager_thread = _ExecutorManagerThread(self) self._executor_manager_thread.start() @@ -1070,7 +1094,7 @@ def weakref_cb( _python_exit) def _adjust_process_count(self): - for _ in range(len(self._processes), self._max_workers): + while len(self._processes) < self._max_workers: worker_exit_lock = self._context.BoundedSemaphore(1) args = (self._call_queue, self._result_queue, self._initializer, self._initargs, self._processes_management_lock, @@ -1086,7 +1110,10 @@ def _adjust_process_count(self): p._worker_exit_lock = worker_exit_lock p.start() self._processes[p.pid] = p - mp.util.debug('Adjust process count : {}'.format(self._processes)) + mp.util.debug( + f"Adjusted process count to {self._max_workers}: " + f"{[(p.name, pid) for pid, p in self._processes.items()]}" + ) def _ensure_executor_running(self): """ensures all workers and management thread are running @@ -1110,7 +1137,7 @@ def submit(self, fn, *args, **kwargs): raise RuntimeError('cannot schedule new futures after ' 'interpreter shutdown') - f = _base.Future() + f = Future() w = _WorkItem(f, fn, args, kwargs) self._pending_work_items[self._queue_count] = w @@ -1121,7 +1148,7 @@ def submit(self, fn, *args, **kwargs): self._ensure_executor_running() return f - submit.__doc__ = _base.Executor.submit.__doc__ + submit.__doc__ = Executor.submit.__doc__ def map(self, fn, *iterables, **kwargs): """Returns an iterator equivalent to map(fn, iter). @@ -1150,13 +1177,14 @@ def map(self, fn, *iterables, **kwargs): if chunksize < 1: raise ValueError("chunksize must be >= 1.") - results = super(ProcessPoolExecutor, self).map( + results = super().map( partial(_process_chunk, fn), _get_chunks(chunksize, *iterables), - timeout=timeout) + timeout=timeout + ) return _chain_from_iterable_of_lists(results) def shutdown(self, wait=True, kill_workers=False): - mp.util.debug('shutting down executor %s' % self) + mp.util.debug(f'shutting down executor {self}') self._flags.flag_as_shutting_down(kill_workers) executor_manager_thread = self._executor_manager_thread @@ -1178,4 +1206,4 @@ def shutdown(self, wait=True, kill_workers=False): self._result_queue = None self._processes_management_lock = None - shutdown.__doc__ = _base.Executor.shutdown.__doc__ + shutdown.__doc__ = Executor.shutdown.__doc__ diff --git a/joblib/externals/loky/reusable_executor.py b/joblib/externals/loky/reusable_executor.py index 9a8e73f37..6b183a0bf 100644 --- a/joblib/externals/loky/reusable_executor.py +++ b/joblib/externals/loky/reusable_executor.py @@ -14,9 +14,6 @@ __all__ = ['get_reusable_executor'] -# Python 2 compat helper -STRING_TYPE = type("") - # Singleton executor and id management _executor_lock = threading.RLock() _next_executor_id = 0 @@ -79,7 +76,7 @@ def get_reusable_executor(max_workers=None, context=None, timeout=10, ``VAR`` are string literals to overwrite the environment variable ``ENV`` in the child processes to value ``VAL``. The environment variables are set in the children before any module is loaded. This only works with with the - ``loky`` context and it is unreliable on Windows with Python < 3.6. + ``loky`` context. 
""" _executor, _ = _ReusablePoolExecutor.get_reusable_executor( max_workers=max_workers, context=context, timeout=timeout, @@ -95,7 +92,7 @@ def __init__(self, submit_resize_lock, max_workers=None, context=None, timeout=None, executor_id=0, job_reducers=None, result_reducers=None, initializer=None, initargs=(), env=None): - super(_ReusablePoolExecutor, self).__init__( + super().__init__( max_workers=max_workers, context=context, timeout=timeout, job_reducers=job_reducers, result_reducers=result_reducers, initializer=initializer, initargs=initargs, env=env) @@ -118,10 +115,10 @@ def get_reusable_executor(cls, max_workers=None, context=None, timeout=10, max_workers = cpu_count() elif max_workers <= 0: raise ValueError( - "max_workers must be greater than 0, got {}." - .format(max_workers)) + f"max_workers must be greater than 0, got {max_workers}." + ) - if isinstance(context, STRING_TYPE): + if isinstance(context, str): context = get_context(context) if context is not None and context.get_start_method() == "fork": raise ValueError( @@ -135,8 +132,9 @@ def get_reusable_executor(cls, max_workers=None, context=None, timeout=10, env=env) if executor is None: is_reused = False - mp.util.debug("Create a executor with max_workers={}." - .format(max_workers)) + mp.util.debug( + f"Create a executor with max_workers={max_workers}." + ) executor_id = _get_next_executor_id() _executor_kwargs = kwargs _executor = executor = cls( @@ -154,9 +152,10 @@ def get_reusable_executor(cls, max_workers=None, context=None, timeout=10, else: reason = "arguments have changed" mp.util.debug( - "Creating a new executor with max_workers={} as the " - "previous instance cannot be reused ({})." - .format(max_workers, reason)) + "Creating a new executor with max_workers= " + f"{max_workers} as the previous instance cannot be " + f"reused ({reason})." + ) executor.shutdown(wait=True, kill_workers=kill_workers) _executor = executor = _executor_kwargs = None # Recursive call to build a new instance @@ -164,8 +163,8 @@ def get_reusable_executor(cls, max_workers=None, context=None, timeout=10, **kwargs) else: mp.util.debug( - "Reusing existing executor with max_workers={}." - .format(executor._max_workers) + "Reusing existing executor with " + f"max_workers={executor._max_workers}." ) is_reused = True executor._resize(max_workers) @@ -174,8 +173,7 @@ def get_reusable_executor(cls, max_workers=None, context=None, timeout=10, def submit(self, fn, *args, **kwargs): with self._submit_resize_lock: - return super(_ReusablePoolExecutor, self).submit( - fn, *args, **kwargs) + return super().submit(fn, *args, **kwargs) def _resize(self, max_workers): with self._submit_resize_lock: @@ -208,25 +206,28 @@ def _resize(self, max_workers): self._adjust_process_count() processes = list(self._processes.values()) - while not all([p.is_alive() for p in processes]): + while not all(p.is_alive() for p in processes): time.sleep(1e-3) def _wait_job_completion(self): """Wait for the cache to be empty before resizing the pool.""" # Issue a warning to the user about the bad effect of this usage. 
- if len(self._pending_work_items) > 0: + if self._pending_work_items: warnings.warn("Trying to resize an executor with running jobs: " "waiting for jobs completion before resizing.", UserWarning) - mp.util.debug("Executor {} waiting for jobs completion before" - " resizing".format(self.executor_id)) + mp.util.debug( + f"Executor {self.executor_id} waiting for jobs completion " + "before resizing" + ) # Wait for the completion of the jobs - while len(self._pending_work_items) > 0: + while self._pending_work_items: time.sleep(1e-3) def _setup_queues(self, job_reducers, result_reducers): # As this executor can be resized, use a large queue size to avoid # underestimating capacity and introducing overhead queue_size = 2 * cpu_count() + EXTRA_QUEUED_CALLS - super(_ReusablePoolExecutor, self)._setup_queues( - job_reducers, result_reducers, queue_size=queue_size) + super()._setup_queues( + job_reducers, result_reducers, queue_size=queue_size + ) diff --git a/joblib/func_inspect.py b/joblib/func_inspect.py index ec6bb4a2f..d334a2b9d 100644 --- a/joblib/func_inspect.py +++ b/joblib/func_inspect.py @@ -142,6 +142,13 @@ def get_func_name(func, resolv_alias=True, win_characters=True): # notebooks splitted = parts[-1].split('-') parts[-1] = '-'.join(splitted[:2] + splitted[3:]) + elif len(parts) > 2 and parts[-2].startswith('ipykernel_'): + # In a notebook session (ipykernel). Filename seems to be 'xyz' + # of above. parts[-2] has the structure ipykernel_XXXXXX where + # XXXXXX is a six-digit number identifying the current run (?). + # If we split it off, the function again has the same + # identifier across runs. + parts[-2] = 'ipykernel' filename = '-'.join(parts) if filename.endswith('.py'): filename = filename[:-3] @@ -171,10 +178,9 @@ def get_func_name(func, resolv_alias=True, win_characters=True): return module, name -def _signature_str(function_name, arg_spec): +def _signature_str(function_name, arg_sig): """Helper function to output a function signature""" - arg_spec_str = inspect.formatargspec(*arg_spec) - return '{}{}'.format(function_name, arg_spec_str) + return '{}{}'.format(function_name, arg_sig) def _function_called_str(function_name, args, kwargs): @@ -221,20 +227,34 @@ def filter_args(func, ignore_lst, args=(), kwargs=dict()): warnings.warn('Cannot inspect object %s, ignore list will ' 'not work.' 
% func, stacklevel=2) return {'*': args, '**': kwargs} - arg_spec = inspect.getfullargspec(func) - arg_names = arg_spec.args + arg_spec.kwonlyargs - arg_defaults = arg_spec.defaults or () - if arg_spec.kwonlydefaults: - arg_defaults = arg_defaults + tuple(arg_spec.kwonlydefaults[k] - for k in arg_spec.kwonlyargs - if k in arg_spec.kwonlydefaults) - arg_varargs = arg_spec.varargs - arg_varkw = arg_spec.varkw - + arg_sig = inspect.signature(func) + arg_names = [] + arg_defaults = [] + arg_kwonlyargs = [] + arg_varargs = None + arg_varkw = None + for param in arg_sig.parameters.values(): + if param.kind is param.POSITIONAL_OR_KEYWORD: + arg_names.append(param.name) + elif param.kind is param.KEYWORD_ONLY: + arg_names.append(param.name) + arg_kwonlyargs.append(param.name) + elif param.kind is param.VAR_POSITIONAL: + arg_varargs = param.name + elif param.kind is param.VAR_KEYWORD: + arg_varkw = param.name + if param.default is not param.empty: + arg_defaults.append(param.default) if inspect.ismethod(func): # First argument is 'self', it has been removed by Python # we need to add it back: args = [func.__self__, ] + args + # func is an instance method, inspect.signature(func) does not + # include self, we need to fetch it from the class method, i.e + # func.__func__ + class_method_sig = inspect.signature(func.__func__) + self_name = next(iter(class_method_sig.parameters)) + arg_names = [self_name] + arg_names # XXX: Maybe I need an inspect.isbuiltin to detect C-level methods, such # as on ndarrays. @@ -244,7 +264,7 @@ def filter_args(func, ignore_lst, args=(), kwargs=dict()): for arg_position, arg_name in enumerate(arg_names): if arg_position < len(args): # Positional argument or keyword argument given as positional - if arg_name not in arg_spec.kwonlyargs: + if arg_name not in arg_kwonlyargs: arg_dict[arg_name] = args[arg_position] else: raise ValueError( @@ -252,7 +272,7 @@ def filter_args(func, ignore_lst, args=(), kwargs=dict()): 'positional parameter for %s:\n' ' %s was called.' % (arg_name, - _signature_str(name, arg_spec), + _signature_str(name, arg_sig), _function_called_str(name, args, kwargs)) ) @@ -268,7 +288,7 @@ def filter_args(func, ignore_lst, args=(), kwargs=dict()): raise ValueError( 'Wrong number of arguments for %s:\n' ' %s was called.' - % (_signature_str(name, arg_spec), + % (_signature_str(name, arg_sig), _function_called_str(name, args, kwargs)) ) from e @@ -296,7 +316,7 @@ def filter_args(func, ignore_lst, args=(), kwargs=dict()): raise ValueError("Ignore list: argument '%s' is not defined for " "function %s" % (item, - _signature_str(name, arg_spec)) + _signature_str(name, arg_sig)) ) # XXX: Return a sorted list of pairs? return arg_dict diff --git a/joblib/hashing.py b/joblib/hashing.py index 24aeb559d..b983e84fa 100644 --- a/joblib/hashing.py +++ b/joblib/hashing.py @@ -193,7 +193,7 @@ def save(self, obj): obj_c_contiguous = obj.T else: # Cater for non-single-segment arrays: this creates a - # copy, and thus aleviates this issue. + # copy, and thus alleviates this issue. 
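The filter_args rewrite earlier in this hunk replaces inspect.getfullargspec with inspect.signature and classifies every parameter by its kind; a short worked example of what that classification yields:

import inspect

def f(a, b=1, *args, c, **kwargs):
    pass

kinds = {name: param.kind.name
         for name, param in inspect.signature(f).parameters.items()}
# {'a': 'POSITIONAL_OR_KEYWORD', 'b': 'POSITIONAL_OR_KEYWORD',
#  'args': 'VAR_POSITIONAL', 'c': 'KEYWORD_ONLY', 'kwargs': 'VAR_KEYWORD'}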
# XXX: There might be a more efficient way of doing this obj_c_contiguous = obj.flatten() diff --git a/joblib/memory.py b/joblib/memory.py index 424d9fea6..fdc58fdfd 100644 --- a/joblib/memory.py +++ b/joblib/memory.py @@ -19,7 +19,6 @@ import traceback import warnings import inspect -import sys import weakref from tokenize import open as open_py_source @@ -33,7 +32,6 @@ from ._store_backends import StoreBackendBase, FileSystemStoreBackend - FIRST_LINE_TEXT = "# first line:" # TODO: The following object should have a data store object as a sub @@ -132,11 +130,10 @@ def _store_backend_factory(backend, location, verbose=0, backend_options=None): return obj elif location is not None: warnings.warn( - "Instanciating a backend using a {} as a location is not " + "Instantiating a backend using a {} as a location is not " "supported by joblib. Returning None instead.".format( location.__class__.__name__), UserWarning) - return None @@ -199,7 +196,7 @@ class MemorizedResult(Logger): func: function or str function whose output is cached. The string case is intended only for - instanciation based on the output of repr() on another instance. + instantiation based on the output of repr() on another instance. (namely eval(repr(memorized_instance)) works). argument_hash: str @@ -361,6 +358,12 @@ def clear(self, warn=True): # Argument "warn" is for compatibility with MemorizedFunc.clear pass + def call(self, *args, **kwargs): + return self.func(*args, **kwargs) + + def check_call_in_cache(self, *args, **kwargs): + return False + ############################################################################### # class `MemorizedFunc` @@ -484,7 +487,7 @@ def _cached_call(self, args, kwargs, shelving=False): metadata = None msg = None - # Wether or not the memorized function must be called + # Whether or not the memorized function must be called must_call = False # FIXME: The statements below should be try/excepted @@ -560,8 +563,8 @@ def func_code_info(self): # (which should be called once on self) gets called in the process # in which self.func was defined, this caching mechanism prevents # undesired cache clearing when the cached function is called in - # an environement where the introspection utilities get_func_code - # relies on do not work (typicially, in joblib child processes). + # an environment where the introspection utilities get_func_code + # relies on do not work (typically, in joblib child processes). # See #1035 for more info # TODO (pierreglaser): do the same with get_func_name? self._func_code_info = get_func_code(self.func) @@ -606,6 +609,21 @@ def __getstate__(self): return state + def check_call_in_cache(self, *args, **kwargs): + """Check if function call is in the memory cache. + + Does not call the function or do any work besides func inspection + and arg hashing. + + Returns + ------- + is_call_in_cache: bool + Whether or not the result of the function has been cached + for the input arguments that have been passed. 
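A hypothetical usage sketch of this new method on a cached function (the cache directory is an arbitrary example path):

from joblib import Memory

memory = Memory("/tmp/joblib_example_cache", verbose=0)

@memory.cache
def heavy(x):
    return x ** 2

heavy.check_call_in_cache(3)   # False: nothing cached for these arguments yet
heavy(3)                       # computes and stores 9 in the cache
heavy.check_call_in_cache(3)   # True: a cached result now exists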
+ """ + func_id, args_id = self._get_output_identifiers(*args, **kwargs) + return self.store_backend.contains_item((func_id, args_id)) + # ------------------------------------------------------------------------ # Private interface # ------------------------------------------------------------------------ @@ -683,8 +701,8 @@ def _check_previous_func_code(self, stacklevel=2): extract_first_line( self.store_backend.get_cached_func_code([func_id])) except (IOError, OSError): # some backend can also raise OSError - self._write_func_code(func_code, first_line) - return False + self._write_func_code(func_code, first_line) + return False if old_func_code == func_code: return True @@ -821,8 +839,6 @@ def _persist_input(self, duration, args, kwargs, this_duration_limit=0.5): % this_duration, stacklevel=5) return metadata - # XXX: Need a method to check if results are available. - # ------------------------------------------------------------------------ # Private `object` interface # ------------------------------------------------------------------------ @@ -848,7 +864,7 @@ class Memory(Logger): Parameters ---------- - location: str or None + location: str, pathlib.Path or None The path of the base directory to use as a data store or None. If None is given, no caching is done and the Memory object is completely transparent. This option @@ -860,12 +876,6 @@ class Memory(Logger): The 'local' backend is using regular filesystem operations to manipulate data (open, mv, etc) in the backend. - cachedir: str or None, optional - - .. deprecated: 0.12 - 'cachedir' has been deprecated in 0.12 and will be - removed in 0.14. Use the 'location' parameter instead. - mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional The memmapping mode used when loading from cache numpy arrays. See numpy.load for the meaning of the @@ -882,20 +892,24 @@ class Memory(Logger): as functions are evaluated. bytes_limit: int, optional - Limit in bytes of the size of the cache. + Limit in bytes of the size of the cache. By default, the size of + the cache is unlimited. When reducing the size of the cache, + ``joblib`` keeps the most recently accessed items first. + + **Note:** You need to call :meth:`joblib.Memory.reduce_size` to + actually reduce the cache size to be less than ``bytes_limit``. backend_options: dict, optional - Contains a dictionnary of named parameters used to configure + Contains a dictionary of named parameters used to configure the store backend. """ # ------------------------------------------------------------------------ # Public interface # ------------------------------------------------------------------------ - def __init__(self, location=None, backend='local', cachedir=None, + def __init__(self, location=None, backend='local', mmap_mode=None, compress=False, verbose=1, bytes_limit=None, backend_options=None): - # XXX: Bad explanation of the None value of cachedir Logger.__init__(self) self._verbose = verbose self.mmap_mode = mmap_mode @@ -910,22 +924,6 @@ def __init__(self, location=None, backend='local', cachedir=None, if compress and mmap_mode is not None: warnings.warn('Compressed results cannot be memmapped', stacklevel=2) - if cachedir is not None: - if location is not None: - raise ValueError( - 'You set both "location={0!r} and "cachedir={1!r}". 
' - "'cachedir' has been deprecated in version " - "0.12 and will be removed in version 0.14.\n" - 'Please only set "location={0!r}"'.format( - location, cachedir)) - - warnings.warn( - "The 'cachedir' parameter has been deprecated in version " - "0.12 and will be removed in version 0.14.\n" - 'You provided "cachedir={0!r}", ' - 'use "location={0!r}" instead.'.format(cachedir), - DeprecationWarning, stacklevel=2) - location = cachedir self.location = location if isinstance(location, str): @@ -936,17 +934,6 @@ def __init__(self, location=None, backend='local', cachedir=None, backend_options=dict(compress=compress, mmap_mode=mmap_mode, **backend_options)) - @property - def cachedir(self): - warnings.warn( - "The 'cachedir' attribute has been deprecated in version 0.12 " - "and will be removed in version 0.14.\n" - "Use os.path.join(memory.location, 'joblib') attribute instead.", - DeprecationWarning, stacklevel=2) - if self.location is None: - return None - return os.path.join(self.location, 'joblib') - def cache(self, func=None, ignore=None, verbose=None, mmap_mode=False): """ Decorates the given function func to only compute its return value for input arguments not cached on disk. @@ -1000,6 +987,12 @@ def clear(self, warn=True): if self.store_backend is not None: self.store_backend.clear() + # As the cache in completely clear, make sure the _FUNCTION_HASHES + # cache is also reset. Else, for a function that is present in this + # table, results cached after this clear will be have cache miss + # as the function code is not re-written. + _FUNCTION_HASHES.clear() + def reduce_size(self): """Remove cache elements to make cache size fit in ``bytes_limit``.""" if self.bytes_limit is not None and self.store_backend is not None: diff --git a/joblib/numpy_pickle.py b/joblib/numpy_pickle.py index 93e5537ea..fa450fbba 100644 --- a/joblib/numpy_pickle.py +++ b/joblib/numpy_pickle.py @@ -7,10 +7,8 @@ import pickle import os import warnings -try: - from pathlib import Path -except ImportError: - Path = None +import io +from pathlib import Path from .compressor import lz4, LZ4_NOT_INSTALLED_ERROR from .compressor import _COMPRESSORS, register_compressor, BinaryZlibFile @@ -20,6 +18,7 @@ from .numpy_pickle_utils import Unpickler, Pickler from .numpy_pickle_utils import _read_fileobject, _write_fileobject from .numpy_pickle_utils import _read_bytes, BUFFER_SIZE +from .numpy_pickle_utils import _ensure_native_byte_order from .numpy_pickle_compat import load_compatibility from .numpy_pickle_compat import NDArrayWrapper # For compatibility with old versions of joblib, we need ZNDArrayWrapper @@ -41,6 +40,11 @@ ############################################################################### # Utility objects for persistence. +# For convenience, 16 bytes are used to be sure to cover all the possible +# dtypes' alignments. For reference, see: +# https://numpy.org/devdocs/dev/alignment.html +NUMPY_ARRAY_ALIGNMENT_BYTES = 16 + class NumpyArrayWrapper(object): """An object to be persisted instead of numpy arrays. @@ -72,13 +76,23 @@ class NumpyArrayWrapper(object): Default: False. """ - def __init__(self, subclass, shape, order, dtype, allow_mmap=False): + def __init__(self, subclass, shape, order, dtype, allow_mmap=False, + numpy_array_alignment_bytes=NUMPY_ARRAY_ALIGNMENT_BYTES): """Constructor. 
Store the useful information for later.""" self.subclass = subclass self.shape = shape self.order = order self.dtype = dtype self.allow_mmap = allow_mmap + # We make numpy_array_alignment_bytes an instance attribute to allow us + # to change our mind about the default alignment and still load the old + # pickles (with the previous alignment) correctly + self.numpy_array_alignment_bytes = numpy_array_alignment_bytes + + def safe_get_numpy_array_alignment_bytes(self): + # NumpyArrayWrapper instances loaded from joblib <= 1.1 pickles don't + # have an numpy_array_alignment_bytes attribute + return getattr(self, 'numpy_array_alignment_bytes', None) def write_array(self, array, pickler): """Write array bytes to pickler file handle. @@ -94,6 +108,23 @@ def write_array(self, array, pickler): # pickle protocol. pickle.dump(array, pickler.file_handle, protocol=2) else: + numpy_array_alignment_bytes = \ + self.safe_get_numpy_array_alignment_bytes() + if numpy_array_alignment_bytes is not None: + current_pos = pickler.file_handle.tell() + pos_after_padding_byte = current_pos + 1 + padding_length = numpy_array_alignment_bytes - ( + pos_after_padding_byte % numpy_array_alignment_bytes) + # A single byte is written that contains the padding length in + # bytes + padding_length_byte = int.to_bytes( + padding_length, length=1, byteorder='little') + pickler.file_handle.write(padding_length_byte) + + if padding_length != 0: + padding = b'\xff' * padding_length + pickler.file_handle.write(padding) + for chunk in pickler.np.nditer(array, flags=['external_loop', 'buffered', @@ -120,6 +151,15 @@ def read_array(self, unpickler): # The array contained Python objects. We need to unpickle the data. array = pickle.load(unpickler.file_handle) else: + numpy_array_alignment_bytes = \ + self.safe_get_numpy_array_alignment_bytes() + if numpy_array_alignment_bytes is not None: + padding_byte = unpickler.file_handle.read(1) + padding_length = int.from_bytes( + padding_byte, byteorder='little') + if padding_length != 0: + unpickler.file_handle.read(padding_length) + # This is not a real file. We have to read it the # memory-intensive way. # crc32 module fails on reads greater than 2 ** 32 bytes, @@ -147,11 +187,22 @@ def read_array(self, unpickler): else: array.shape = self.shape - return array + # Detect byte order mismatch and swap as needed. + return _ensure_native_byte_order(array) def read_mmap(self, unpickler): """Read an array using numpy memmap.""" - offset = unpickler.file_handle.tell() + current_pos = unpickler.file_handle.tell() + offset = current_pos + numpy_array_alignment_bytes = \ + self.safe_get_numpy_array_alignment_bytes() + + if numpy_array_alignment_bytes is not None: + padding_byte = unpickler.file_handle.read(1) + padding_length = int.from_bytes(padding_byte, byteorder='little') + # + 1 is for the padding byte + offset += padding_length + 1 + if unpickler.mmap_mode == 'w+': unpickler.mmap_mode = 'r+' @@ -164,6 +215,20 @@ def read_mmap(self, unpickler): # update the offset so that it corresponds to the end of the read array unpickler.file_handle.seek(offset + marray.nbytes) + if (numpy_array_alignment_bytes is None and + current_pos % NUMPY_ARRAY_ALIGNMENT_BYTES != 0): + message = ( + f'The memmapped array {marray} loaded from the file ' + f'{unpickler.file_handle.name} is not not bytes aligned. ' + 'This may cause segmentation faults if this memmapped array ' + 'is used in some libraries like BLAS or PyTorch. ' + 'To get rid of this warning, regenerate your pickle file ' + 'with joblib >= 1.2.0. 
' + 'See https://github.com/joblib/joblib/issues/563 ' + 'for more details' + ) + warnings.warn(message) + return marray def read(self, unpickler): @@ -240,9 +305,17 @@ def _create_array_wrapper(self, array): order = 'F' if (array.flags.f_contiguous and not array.flags.c_contiguous) else 'C' allow_mmap = not self.buffered and not array.dtype.hasobject + + kwargs = {} + try: + self.file_handle.tell() + except io.UnsupportedOperation: + kwargs = {'numpy_array_alignment_bytes': None} + wrapper = NumpyArrayWrapper(type(array), array.shape, order, array.dtype, - allow_mmap=allow_mmap) + allow_mmap=allow_mmap, + **kwargs) return wrapper diff --git a/joblib/numpy_pickle_compat.py b/joblib/numpy_pickle_compat.py index 6541a066a..5316c0225 100644 --- a/joblib/numpy_pickle_compat.py +++ b/joblib/numpy_pickle_compat.py @@ -9,7 +9,7 @@ from .numpy_pickle_utils import _ZFILE_PREFIX from .numpy_pickle_utils import Unpickler - +from .numpy_pickle_utils import _ensure_native_byte_order def hex_str(an_int): """Convert an int to an hexadecimal string.""" @@ -63,7 +63,7 @@ def write_zfile(file_handle, data, compress=1): """Write the data in the given file as a Z-file. Z-files are raw data compressed with zlib used internally by joblib - for persistence. Backward compatibility is not guarantied. Do not + for persistence. Backward compatibility is not guaranteed. Do not use for external purposes. """ file_handle.write(_ZFILE_PREFIX) @@ -105,6 +105,9 @@ def read(self, unpickler): kwargs["allow_pickle"] = True array = unpickler.np.load(filename, **kwargs) + # Detect byte order mismatch and swap as needed. + array = _ensure_native_byte_order(array) + # Reconstruct subclasses. This does not work with old # versions of numpy if (hasattr(array, '__array_prepare__') and diff --git a/joblib/numpy_pickle_utils.py b/joblib/numpy_pickle_utils.py index a50105547..71f2c7c59 100644 --- a/joblib/numpy_pickle_utils.py +++ b/joblib/numpy_pickle_utils.py @@ -6,6 +6,7 @@ import pickle import io +import sys import warnings import contextlib @@ -48,6 +49,30 @@ def _get_prefixes_max_len(): return max(prefixes) +def _is_numpy_array_byte_order_mismatch(array): + """Check if numpy array is having byte order mismatch""" + return ((sys.byteorder == 'big' and + (array.dtype.byteorder == '<' or + (array.dtype.byteorder == '|' and array.dtype.fields and + all(e[0].byteorder == '<' + for e in array.dtype.fields.values())))) or + (sys.byteorder == 'little' and + (array.dtype.byteorder == '>' or + (array.dtype.byteorder == '|' and array.dtype.fields and + all(e[0].byteorder == '>' + for e in array.dtype.fields.values()))))) + + +def _ensure_native_byte_order(array): + """Use the byte order of the host while preserving values + + Does nothing if array already uses the system byte order. + """ + if _is_numpy_array_byte_order_mismatch(array): + array = array.byteswap().newbyteorder('=') + return array + + ############################################################################### # Cache file utilities def _detect_compressor(fileobj): diff --git a/joblib/parallel.py b/joblib/parallel.py index 17a9f2313..6e7b1b19a 100644 --- a/joblib/parallel.py +++ b/joblib/parallel.py @@ -27,7 +27,7 @@ ThreadingBackend, SequentialBackend, LokyBackend) from .externals.cloudpickle import dumps, loads -from .externals import loky +from ._utils import eval_expr # Make sure that those two classes are part of the public joblib.parallel API # so that 3rd party backend implementers can import them from here. 
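Returning to the alignment change in numpy_pickle.py above: a single padding-length byte is written first, then enough padding bytes so that the array data itself starts on a 16-byte boundary. A small worked example of that arithmetic:

NUMPY_ARRAY_ALIGNMENT_BYTES = 16

def padding_length(current_pos, alignment=NUMPY_ARRAY_ALIGNMENT_BYTES):
    # One byte is always reserved for the padding-length marker itself.
    pos_after_padding_byte = current_pos + 1
    return alignment - (pos_after_padding_byte % alignment)

padding_length(13)   # 2  -> array data starts at offset 13 + 1 + 2 = 16
padding_length(31)   # 16 -> array data starts at offset 31 + 1 + 16 = 48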
@@ -36,15 +36,28 @@ BACKENDS = { - 'multiprocessing': MultiprocessingBackend, 'threading': ThreadingBackend, 'sequential': SequentialBackend, - 'loky': LokyBackend, } # name of the backend used by default by Parallel outside of any context # managed by ``parallel_backend``. -DEFAULT_BACKEND = 'loky' + +# threading is the only backend that is always everywhere +DEFAULT_BACKEND = 'threading' + DEFAULT_N_JOBS = 1 + +MAYBE_AVAILABLE_BACKENDS = {'multiprocessing', 'loky'} + +# if multiprocessing is available, so is loky, we set it as the default +# backend +if mp is not None: + BACKENDS['multiprocessing'] = MultiprocessingBackend + from .externals import loky + BACKENDS['loky'] = LokyBackend + DEFAULT_BACKEND = 'loky' + + DEFAULT_THREAD_BACKEND = 'threading' # Thread local value that can be overridden by the ``parallel_backend`` context @@ -123,7 +136,7 @@ class parallel_backend(object): """Change the default backend used by Parallel inside a with block. If ``backend`` is a string it must match a previously registered - implementation using the ``register_parallel_backend`` function. + implementation using the :func:`~register_parallel_backend` function. By default the following backends are available: @@ -135,7 +148,9 @@ class parallel_backend(object): 'threading' is a low-overhead alternative that is most efficient for functions that release the Global Interpreter Lock: e.g. I/O-bound code or CPU-bound code in a few calls to native code that explicitly releases the - GIL. + GIL. Note that on some rare systems (such as pyiodine), + multiprocessing and loky may not be available, in which case joblib + defaults to threading. In addition, if the `dask` and `distributed` Python packages are installed, it is possible to use the 'dask' backend for better scheduling of nested @@ -158,9 +173,9 @@ class parallel_backend(object): caller passes an explicit value for the ``n_jobs`` parameter. This is an alternative to passing a ``backend='backend_name'`` argument to - the ``Parallel`` class constructor. It is particularly useful when calling - into library code that uses joblib internally but does not expose the - backend argument in its own API. + the :class:`~Parallel` class constructor. It is particularly useful when + calling into library code that uses joblib internally but does not expose + the backend argument in its own API. >>> from operator import neg >>> with parallel_backend('threading'): @@ -184,9 +199,20 @@ class parallel_backend(object): def __init__(self, backend, n_jobs=-1, inner_max_num_threads=None, **backend_params): if isinstance(backend, str): - if backend not in BACKENDS and backend in EXTERNAL_BACKENDS: - register = EXTERNAL_BACKENDS[backend] - register() + if backend not in BACKENDS: + if backend in EXTERNAL_BACKENDS: + register = EXTERNAL_BACKENDS[backend] + register() + elif backend in MAYBE_AVAILABLE_BACKENDS: + warnings.warn( + f"joblib backend '{backend}' is not available on " + f"your system, falling back to {DEFAULT_BACKEND}.", + UserWarning, + stacklevel=2) + BACKENDS[backend] = BACKENDS[DEFAULT_BACKEND] + else: + raise ValueError("Invalid backend: %s, expected one of %r" + % (backend, sorted(BACKENDS.keys()))) backend = BACKENDS[backend](**backend_params) @@ -364,8 +390,8 @@ def register_parallel_backend(name, factory, make_default=False): """Register a new Parallel backend factory. The new backend can then be selected by passing its name as the backend - argument to the Parallel class. 
Moreover, the default backend can be - overwritten globally by setting make_default=True. + argument to the :class:`~Parallel` class. Moreover, the default backend can + be overwritten globally by setting make_default=True. The factory can be any callable that takes no argument and return an instance of ``ParallelBackendBase``. @@ -428,15 +454,17 @@ class Parallel(Logger): CPUs but one are used. None is a marker for 'unset' that will be interpreted as n_jobs=1 (sequential execution) unless the call is performed under a - parallel_backend context manager that sets another value for - n_jobs. + :func:`~parallel_backend` context manager that sets another value + for n_jobs. backend: str, ParallelBackendBase instance or None, default: 'loky' Specify the parallelization backend implementation. Supported backends are: - "loky" used by default, can induce some communication and memory overhead when exchanging input and - output data with the worker Python processes. + output data with the worker Python processes. On some rare + systems (such as Pyiodide), the loky backend may not be + available. - "multiprocessing" previous process-based backend based on `multiprocessing.Pool`. Less robust than `loky`. - "threading" is a very low-overhead backend but it suffers @@ -447,18 +475,18 @@ class Parallel(Logger): in a "with nogil" block or an expensive call to a library such as NumPy). - finally, you can register backends by calling - register_parallel_backend. This will allow you to implement - a backend of your liking. + :func:`~register_parallel_backend`. This will allow you to + implement a backend of your liking. It is not recommended to hard-code the backend name in a call to - Parallel in a library. Instead it is recommended to set soft hints - (prefer) or hard constraints (require) so as to make it possible - for library users to change the backend from the outside using the - parallel_backend context manager. + :class:`~Parallel` in a library. Instead it is recommended to set + soft hints (prefer) or hard constraints (require) so as to make it + possible for library users to change the backend from the outside + using the :func:`~parallel_backend` context manager. prefer: str in {'processes', 'threads'} or None, default: None Soft hint to choose the default backend if no specific backend - was selected with the parallel_backend context manager. The - default process-based backend is 'loky' and the default + was selected with the :func:`~parallel_backend` context manager. + The default process-based backend is 'loky' and the default thread-based backend is 'threading'. Ignored if the ``backend`` parameter is specified. require: 'sharedmem' or None, default None @@ -477,7 +505,9 @@ class Parallel(Logger): pre_dispatch: {'all', integer, or expression, as in '3*n_jobs'} The number of batches (of tasks) to be pre-dispatched. Default is '2*n_jobs'. When batch_size="auto" this is reasonable - default and the workers should never starve. + default and the workers should never starve. Note that only basic + arithmetics are allowed here and no modules can be used in this + expression. batch_size: int or 'auto', default: 'auto' The number of atomic tasks to dispatch at once to each worker. When individual evaluations are very fast, dispatching @@ -513,9 +543,11 @@ class Parallel(Logger): in Bytes, or a human-readable string, e.g., '1M' for 1 megabyte. Use None to disable memmapping of large arrays. Only active when backend="loky" or "multiprocessing". 
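A hypothetical illustration of the soft-hint guidance above: a library asks for threads without hard-coding a backend, and a caller can still override that choice from the outside with the parallel_backend context manager.

from math import sqrt
from joblib import Parallel, delayed, parallel_backend

def library_helper(values):
    # Soft hint only: applies when no backend was selected from the outside.
    return Parallel(n_jobs=2, prefer="threads")(
        delayed(sqrt)(v) for v in values)

library_helper(range(4))                  # runs with the thread-based backend
with parallel_backend("loky", n_jobs=2):
    library_helper(range(4))              # the caller-selected backend wins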
- mmap_mode: {None, 'r+', 'r', 'w+', 'c'} - Memmapping mode for numpy arrays passed to workers. - See 'max_nbytes' parameter documentation for more details. + mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, default: 'r' + Memmapping mode for numpy arrays passed to workers. None will + disable memmapping, other modes defined in the numpy.memmap doc: + https://numpy.org/doc/stable/reference/generated/numpy.memmap.html + Also, see 'max_nbytes' parameter documentation for more details. Notes ----- @@ -688,6 +720,16 @@ def __init__(self, n_jobs=None, backend=None, verbose=0, timeout=None, # preload modules on the forkserver helper process. self._backend_args['context'] = backend backend = MultiprocessingBackend(nesting_level=nesting_level) + + elif backend not in BACKENDS and backend in MAYBE_AVAILABLE_BACKENDS: + warnings.warn( + f"joblib backend '{backend}' is not available on " + f"your system, falling back to {DEFAULT_BACKEND}.", + UserWarning, + stacklevel=2) + BACKENDS[backend] = BACKENDS[DEFAULT_BACKEND] + backend = BACKENDS[DEFAULT_BACKEND](nesting_level=nesting_level) + else: try: backend_factory = BACKENDS[backend] @@ -1010,7 +1052,9 @@ def _batched_calls_reducer_callback(): else: self._original_iterator = iterator if hasattr(pre_dispatch, 'endswith'): - pre_dispatch = eval(pre_dispatch) + pre_dispatch = eval_expr( + pre_dispatch.replace("n_jobs", str(n_jobs)) + ) self._pre_dispatch_amount = pre_dispatch = int(pre_dispatch) # The main thread will consume the first pre_dispatch items and diff --git a/joblib/test/data/create_numpy_pickle.py b/joblib/test/data/create_numpy_pickle.py index 0128f91ed..3a3a311fe 100644 --- a/joblib/test/data/create_numpy_pickle.py +++ b/joblib/test/data/create_numpy_pickle.py @@ -64,7 +64,7 @@ def write_test_pickle(to_pickle, args): print("Error: cannot generate file '{}' with arguments '{}'. " "Error was: {}".format(pickle_filename, kwargs, e)) else: - print("File '{}' generated successfuly.".format(pickle_filename)) + print("File '{}' generated successfully.".format(pickle_filename)) if __name__ == '__main__': import argparse diff --git a/joblib/test/test_cloudpickle_wrapper.py b/joblib/test/test_cloudpickle_wrapper.py new file mode 100644 index 000000000..733f51c72 --- /dev/null +++ b/joblib/test/test_cloudpickle_wrapper.py @@ -0,0 +1,27 @@ +""" +Test that our implementation of wrap_non_picklable_objects mimics +properly the loky implementation. 
+""" + +from .._cloudpickle_wrapper import wrap_non_picklable_objects +from .._cloudpickle_wrapper import my_wrap_non_picklable_objects + + +def a_function(x): + return x + + +class AClass(object): + + def __call__(self, x): + return x + + +def test_wrap_non_picklable_objects(): + # Mostly a smoke test: test that we can use callable in the same way + # with both our implementation of wrap_non_picklable_objects and the + # upstream one + for obj in (a_function, AClass()): + wrapped_obj = wrap_non_picklable_objects(obj) + my_wrapped_obj = my_wrap_non_picklable_objects(obj) + assert wrapped_obj(1) == my_wrapped_obj(1) diff --git a/joblib/test/test_dask.py b/joblib/test/test_dask.py index feb112040..9f072a128 100644 --- a/joblib/test/test_dask.py +++ b/joblib/test/test_dask.py @@ -1,5 +1,6 @@ from __future__ import print_function, division, absolute_import import os +import warnings import pytest from random import random @@ -11,6 +12,7 @@ from .._dask import DaskDistributedBackend distributed = pytest.importorskip('distributed') +dask = pytest.importorskip('dask') from distributed import Client, LocalCluster, get_client from distributed.metrics import time from distributed.utils_test import cluster, inc @@ -114,7 +116,7 @@ def f(dask_scheduler): def test_no_undesired_distributed_cache_hit(loop): # Dask has a pickle cache for callables that are called many times. Because - # the dask backends used to wrapp both the functions and the arguments + # the dask backends used to wrap both the functions and the arguments # under instances of the Batch callable class this caching mechanism could # lead to bugs as described in: https://github.com/joblib/joblib/pull/1055 # The joblib-dask backend has been refactored to avoid bundling the @@ -462,3 +464,28 @@ def test_wait_for_workers_timeout(): finally: client.close() cluster.close() + + +@pytest.mark.parametrize("backend", ["loky", "multiprocessing"]) +def test_joblib_warning_inside_dask_daemonic_worker(backend): + cluster = LocalCluster(n_workers=2) + client = Client(cluster) + + def func_using_joblib_parallel(): + # Somehow trying to check the warning type here (e.g. with + # pytest.warns(UserWarning)) make the test hang. Work-around: return + # the warning record to the client and the warning check is done + # client-side. + with warnings.catch_warnings(record=True) as record: + Parallel(n_jobs=2, backend=backend)( + delayed(inc)(i) for i in range(10)) + + return record + + fut = client.submit(func_using_joblib_parallel) + record = fut.result() + + assert len(record) == 1 + warning = record[0].message + assert isinstance(warning, UserWarning) + assert "distributed.worker.daemon" in str(warning) diff --git a/joblib/test/test_deprecated_objects.py b/joblib/test/test_deprecated_objects.py index d561483ee..9ca6b0882 100644 --- a/joblib/test/test_deprecated_objects.py +++ b/joblib/test/test_deprecated_objects.py @@ -2,15 +2,12 @@ Tests making sure that deprecated objects properly raise a deprecation warning when imported/created. 
""" -import sys - import pytest from joblib.my_exceptions import _deprecated_names as _deprecated_exceptions from joblib.format_stack import _deprecated_names as _deprecated_format_utils -@pytest.mark.xfail(sys.version_info < (3, 7), reason="no module-level getattr") def test_deprecated_joblib_exceptions(): assert 'JoblibException' in _deprecated_exceptions for name in _deprecated_exceptions: @@ -20,7 +17,6 @@ def test_deprecated_joblib_exceptions(): exec('from joblib.my_exceptions import {}'.format(name)) -@pytest.mark.xfail(sys.version_info < (3, 7), reason="no module-level getattr") def test_deprecated_formatting_utilities(capsys): assert 'safe_repr' in _deprecated_format_utils assert 'eq_repr' in _deprecated_format_utils diff --git a/joblib/test/test_hashing.py b/joblib/test/test_hashing.py index 37d9480ac..3a3d6316c 100644 --- a/joblib/test/test_hashing.py +++ b/joblib/test/test_hashing.py @@ -64,7 +64,7 @@ def f(self, x): class KlassWithCachedMethod(object): def __init__(self, cachedir): - mem = Memory(cachedir=cachedir) + mem = Memory(location=cachedir) self.f = mem.cache(self.f) def f(self, x): @@ -260,8 +260,8 @@ def test_numpy_scalar(): def test_dict_hash(tmpdir): - # Check that dictionaries hash consistently, eventhough the ordering - # of the keys is not garanteed + # Check that dictionaries hash consistently, even though the ordering + # of the keys is not guaranteed k = KlassWithCachedMethod(tmpdir.strpath) d = {'#s12069__c_maps.nii.gz': [33], diff --git a/joblib/test/test_memmapping.py b/joblib/test/test_memmapping.py index dc40d23f8..bdc825f06 100644 --- a/joblib/test/test_memmapping.py +++ b/joblib/test/test_memmapping.py @@ -9,6 +9,8 @@ import subprocess import threading +import pytest + from joblib.test.common import with_numpy, np from joblib.test.common import setup_autokill from joblib.test.common import teardown_autokill @@ -83,7 +85,7 @@ def test_memmap_based_array_reducing(tmpdir): buffer[:] = - 1.0 * np.arange(buffer.shape[0], dtype=buffer.dtype) buffer.flush() - # Memmap a 2D fortran array on a offseted subsection of the previous + # Memmap a 2D fortran array on a offsetted subsection of the previous # buffer a = np.memmap(filename, dtype=np.float64, shape=(3, 5, 4), mode='r+', order='F', offset=4) @@ -146,7 +148,8 @@ def reconstruct_array_or_memmap(x): assert_array_equal(b3_reconstructed, b3) -@skipif(sys.platform != "win32", +@with_multiprocessing +@skipif((sys.platform != "win32") or (), reason="PermissionError only easily triggerable on Windows") def test_resource_tracker_retries_when_permissionerror(tmpdir): # Test resource_tracker retry mechanism when unlinking memmaps. See more @@ -355,6 +358,7 @@ def test_pool_with_memmap_array_view(factory, tmpdir): @with_numpy +@with_multiprocessing @parametrize("backend", ["multiprocessing", "loky"]) def test_permission_error_windows_reference_cycle(backend): # Non regression test for: @@ -389,6 +393,7 @@ def test_permission_error_windows_reference_cycle(backend): @with_numpy +@with_multiprocessing @parametrize("backend", ["multiprocessing", "loky"]) def test_permission_error_windows_memmap_sent_to_parent(backend): # Second non-regression test for: @@ -581,39 +586,6 @@ def parallel_raise(array, temp_dirs): assert b"resource_tracker" not in err, err.decode() -@with_numpy -@with_multiprocessing -def test_nested_loop_error_in_grandchild_resource_tracker_silent(): - # Safety smoke test: test that nested parallel calls using the loky backend - # don't yield noisy resource_tracker outputs when the grandchild errors - # out. 
- cmd = '''if 1: - from joblib import Parallel, delayed - - - def raise_error(i): - raise ValueError - - - def nested_loop(f): - Parallel(backend="loky", n_jobs=2)( - delayed(f)(i) for i in range(10) - ) - - - if __name__ == "__main__": - Parallel(backend="loky", n_jobs=2)( - delayed(nested_loop)(func) for func in [raise_error] - ) - ''' - p = subprocess.Popen([sys.executable, '-c', cmd], - stderr=subprocess.PIPE, stdout=subprocess.PIPE) - p.wait() - out, err = p.communicate() - assert p.returncode == 1, out.decode() - assert b"resource_tracker" not in err, err.decode() - - @with_numpy @with_multiprocessing @parametrize("backend", ["multiprocessing", "loky"]) @@ -641,29 +613,25 @@ def test_many_parallel_calls_on_same_object(backend): delayed(return_slice_of_data)(data, 0, 20) for _ in range(10) ) - slice_of_data = Parallel( - n_jobs=2, max_nbytes=1, backend='{b}')( - delayed(return_slice_of_data)(data, 0, 20) - for _ in range(10) - ) '''.format(b=backend) - - for _ in range(3): - env = os.environ.copy() - env['PYTHONPATH'] = os.path.dirname(__file__) - p = subprocess.Popen([sys.executable, '-c', cmd], - stderr=subprocess.PIPE, - stdout=subprocess.PIPE, env=env) - p.wait() - out, err = p.communicate() - assert p.returncode == 0, err - assert out == b'' - if sys.version_info[:3] not in [(3, 8, 0), (3, 8, 1)]: - # In early versions of Python 3.8, a reference leak - # https://github.com/cloudpipe/cloudpickle/issues/327, holds - # references to pickled objects, generating race condition during - # cleanup finalizers of joblib and noisy resource_tracker outputs. - assert b'resource_tracker' not in err + env = os.environ.copy() + env['PYTHONPATH'] = os.path.dirname(__file__) + p = subprocess.Popen( + [sys.executable, '-c', cmd], + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, + env=env, + ) + p.wait() + out, err = p.communicate() + assert p.returncode == 0, err + assert out == b'' + if sys.version_info[:3] not in [(3, 8, 0), (3, 8, 1)]: + # In early versions of Python 3.8, a reference leak + # https://github.com/cloudpipe/cloudpickle/issues/327, holds + # references to pickled objects, generating race condition during + # cleanup finalizers of joblib and noisy resource_tracker outputs. + assert b'resource_tracker' not in err @with_numpy @@ -679,7 +647,7 @@ def test_memmap_returned_as_regular_array(backend): @with_numpy @with_multiprocessing -@parametrize("backend", ["multiprocessing", param("loky", marks=xfail)]) +@parametrize("backend", ["multiprocessing", "loky"]) def test_resource_tracker_silent_when_reference_cycles(backend): # There is a variety of reasons that can make joblib with loky backend # output noisy warnings when a reference cycle is preventing a memmap from @@ -687,10 +655,22 @@ def test_resource_tracker_silent_when_reference_cycles(backend): # deletes the temporary folder if it was not done before, which can # interact badly with the resource_tracker. We don't risk leaking any # resources, but this will likely make joblib output a lot of low-level - # confusing messages. This test is marked as xfail for now: but a next PR - # should fix this behavior. + # confusing messages. + # + # This test makes sure that the resource_tracker is silent when a reference + # has been collected concurrently on non-Windows platforms. + # # Note that the script in ``cmd`` is the exact same script as in # test_permission_error_windows_reference_cycle. 
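The memmapping tests above (including the ``return_slice_of_data`` subprocess script) rely on joblib automatically dumping large array arguments to temporary memmaps. A small sketch of that user-facing behaviour; ``received_type`` is a made-up helper and ``max_nbytes=1`` is only used, as in the tests, to force memmapping of a modest array:

import numpy as np
from joblib import Parallel, delayed

def received_type(a):
    # Workers see a numpy.memmap once the argument exceeds max_nbytes.
    return type(a).__name__

data = np.random.rand(1000)
print(Parallel(n_jobs=2, max_nbytes=1, backend='loky')(
    delayed(received_type)(data) for _ in range(3)))
# Expected on a system where the loky backend is available:
# ['memmap', 'memmap', 'memmap']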
+ if backend == "loky" and sys.platform.startswith('win'): + # XXX: on Windows, reference cycles can delay timely garbage collection + # and make it impossible to properly delete the temporary folder in the + # main process because of permission errors. + pytest.xfail( + "The temporary folder cannot be deleted on Windows in the " + "presence of a reference cycle" + ) + cmd = """if 1: import numpy as np from joblib import Parallel, delayed @@ -714,8 +694,10 @@ def test_resource_tracker_silent_when_reference_cycles(backend): stdout=subprocess.PIPE) p.wait() out, err = p.communicate() - assert p.returncode == 0, out.decode() - assert b"resource_tracker" not in err, err.decode() + out = out.decode() + err = err.decode() + assert p.returncode == 0, out + "\n\n" + err + assert "resource_tracker" not in err, err @with_numpy @@ -728,7 +710,7 @@ def test_memmapping_pool_for_large_arrays(factory, tmpdir): # Check that the tempfolder is empty assert os.listdir(tmpdir.strpath) == [] - # Build an array reducers that automaticaly dump large array content + # Build an array reducers that automatically dump large array content # to filesystem backed memmap instances to avoid memory explosion p = factory(3, max_nbytes=40, temp_folder=tmpdir.strpath, verbose=2) try: @@ -776,7 +758,18 @@ def test_memmapping_pool_for_large_arrays(factory, tmpdir): @with_numpy @with_multiprocessing -@parametrize("backend", ["multiprocessing", "loky"]) +@parametrize( + "backend", + [ + pytest.param( + "multiprocessing", + marks=pytest.mark.xfail( + reason='https://github.com/joblib/joblib/issues/1086' + ), + ), + "loky", + ] +) def test_child_raises_parent_exits_cleanly(backend): # When a task executed by a child process raises an error, the parent # process's backend is notified, and calls abort_everything. @@ -794,6 +787,8 @@ def test_child_raises_parent_exits_cleanly(backend): # - the resource_tracker does not emit any warnings. cmd = """if 1: import os + from pathlib import Path + from time import sleep import numpy as np from joblib import Parallel, delayed @@ -801,12 +796,11 @@ def test_child_raises_parent_exits_cleanly(backend): data = np.random.rand(1000) - def get_temp_folder(parallel_obj, backend): if "{b}" == "loky": - return p._backend._workers._temp_folder + return Path(p._backend._workers._temp_folder) else: - return p._backend._pool._temp_folder + return Path(p._backend._pool._temp_folder) if __name__ == "__main__": @@ -815,10 +809,27 @@ def get_temp_folder(parallel_obj, backend): temp_folder = get_temp_folder(p, "{b}") p(delayed(print_filename_and_raise)(data) for i in range(1)) - except ValueError: + except ValueError as e: # the temporary folder should be deleted by the end of this - # call - assert not os.path.exists(temp_folder) + # call but apparently on some file systems, this takes + # some time to be visible. + # + # We attempt to write into the temporary folder to test for + # its existence and we wait for a maximum of 10 seconds. + for i in range(100): + try: + with open(temp_folder / "some_file.txt", "w") as f: + f.write("some content") + except FileNotFoundError: + # temp_folder has been deleted, all is fine + break + + # ... 
else, wait a bit and try again + sleep(.1) + else: + raise AssertionError( + str(temp_folder) + " was not deleted" + ) from e """.format(b=backend) env = os.environ.copy() env['PYTHONPATH'] = os.path.dirname(__file__) @@ -828,7 +839,7 @@ def get_temp_folder(parallel_obj, backend): out, err = p.communicate() out, err = out.decode(), err.decode() filename = out.split('\n')[0] - assert p.returncode == 0, out + assert p.returncode == 0, err or out assert err == '' # no resource_tracker warnings. assert not os.path.exists(filename) @@ -951,7 +962,7 @@ def test_memmapping_pool_for_large_arrays_in_return(factory, tmpdir): """Check that large arrays are not copied in memory in return""" assert_array_equal = np.testing.assert_array_equal - # Build an array reducers that automaticaly dump large array content + # Build an array reducers that automatically dump large array content # but check that the returned datastructure are regular arrays to avoid # passing a memmap array pointing to a pool controlled temp folder that # might be confusing to the user diff --git a/joblib/test/test_memory.py b/joblib/test/test_memory.py index ad0ddf4ed..aaa7d1695 100644 --- a/joblib/test/test_memory.py +++ b/joblib/test/test_memory.py @@ -6,6 +6,7 @@ # Copyright (c) 2009 Gael Varoquaux # License: BSD Style, 3 clauses. +import functools import gc import shutil import os @@ -187,7 +188,7 @@ def f(x): if call_before_reducing: cached_f(3) # Two files were just created, func_code.py, and a folder - # containing the informations (inputs hash/ouptput) of + # containing the information (inputs hash/ouptput) of # cached_f(3) assert len(os.listdir(f_cache_directory / 'f')) == 2 @@ -364,7 +365,7 @@ def test_memory_eval(tmpdir): def count_and_append(x=[]): """ A function with a side effect in its arguments. - Return the lenght of its argument and append one element. + Return the length of its argument and append one element. """ len_x = len(x) x.append(None) @@ -488,6 +489,32 @@ def z(x, y=1): assert len(accumulator) == 1 +def test_memory_ignore_decorated(tmpdir): + " Test the ignore feature of memory on a decorated function " + memory = Memory(location=tmpdir.strpath, verbose=0) + accumulator = list() + + def decorate(f): + @functools.wraps(f) + def wrapped(*args, **kwargs): + return f(*args, **kwargs) + return wrapped + + @memory.cache(ignore=['y']) + @decorate + def z(x, y=1): + accumulator.append(1) + + assert z.ignore == ['y'] + + z(0, y=1) + assert len(accumulator) == 1 + z(0, y=1) + assert len(accumulator) == 1 + z(0, y=2) + assert len(accumulator) == 1 + + def test_memory_args_as_kwargs(tmpdir): """Non-regression test against 0.12.0 changes. @@ -537,10 +564,6 @@ def test_func_dir(tmpdir): assert location == path assert os.path.exists(path) assert memory.location == os.path.dirname(g.store_backend.location) - with warns(DeprecationWarning) as w: - assert memory.cachedir == g.store_backend.location - assert len(w) == 1 - assert "The 'cachedir' attribute has been deprecated" in str(w[-1].message) # Test that the code is stored. 
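The new ``test_memory_ignore_decorated`` above checks that ``ignore`` still works when the cached function is wrapped with ``functools.wraps``, and the removed assertions reflect that the deprecated ``cachedir`` parameter is gone in favour of ``location``. A short usage sketch; the cache path and the ``logged``/``compute`` names are made up:

import functools
from joblib import Memory

memory = Memory(location='/tmp/joblib_example_cache', verbose=0)

def logged(f):
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        return f(*args, **kwargs)
    return wrapper

@memory.cache(ignore=['verbose'])
@logged
def compute(x, verbose=0):
    return x ** 2

compute(3)             # computed and written to the cache
compute(3, verbose=1)  # cache hit: 'verbose' is not part of the cache key
memory.clear()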
# For the following test to be robust to previous execution, we clear @@ -582,6 +605,19 @@ def test_persistence(tmpdir): gp(1) +def test_check_call_in_cache(tmpdir): + for func in (MemorizedFunc(f, tmpdir.strpath), + Memory(location=tmpdir.strpath, verbose=0).cache(f)): + result = func.check_call_in_cache(2) + assert not result + assert isinstance(result, bool) + assert func(2) == 5 + result = func.check_call_in_cache(2) + assert result + assert isinstance(result, bool) + func.clear() + + def test_call_and_shelve(tmpdir): # Test MemorizedFunc outputting a reference to cache. @@ -942,11 +978,14 @@ def test_memory_reduce_size(tmpdir): def test_memory_clear(tmpdir): - memory, _, _ = _setup_toy_cache(tmpdir) + memory, _, g = _setup_toy_cache(tmpdir) memory.clear() assert os.listdir(memory.store_backend.location) == [] + # Check that the cache for functions hash is also reset. + assert not g._check_previous_func_code(stacklevel=4) + def fast_func_with_complex_output(): complex_obj = ['a' * 1000] * 1000 @@ -1049,31 +1088,8 @@ def func(arg): assert message in str(e.args) -def test_deprecated_cachedir_behaviour(tmpdir): - # verify the right deprecation warnings are raised when using cachedir - # option instead of new location parameter. - with warns(None) as w: - memory = Memory(cachedir=tmpdir.strpath, verbose=0) - assert memory.store_backend.location.startswith(tmpdir.strpath) - - assert len(w) == 1 - assert "The 'cachedir' parameter has been deprecated" in str(w[-1].message) - - with warns(None) as w: - memory = Memory() - assert memory.cachedir is None - - assert len(w) == 1 - assert "The 'cachedir' attribute has been deprecated" in str(w[-1].message) - - error_regex = """You set both "location='.+ and "cachedir='.+""" - with raises(ValueError, match=error_regex): - memory = Memory(location=tmpdir.strpath, cachedir=tmpdir.strpath, - verbose=0) - - class IncompleteStoreBackend(StoreBackendBase): - """This backend cannot be instanciated and should raise a TypeError.""" + """This backend cannot be instantiated and should raise a TypeError.""" pass @@ -1130,7 +1146,7 @@ def test_register_invalid_store_backends_object(): def test_memory_default_store_backend(): - # test an unknow backend falls back into a FileSystemStoreBackend + # test an unknown backend falls back into a FileSystemStoreBackend with raises(TypeError) as excinfo: Memory(location='/tmp/joblib', backend='unknown') excinfo.match(r"Unknown location*") @@ -1144,7 +1160,7 @@ class NonSupportedLocationClass: with warns(UserWarning) as warninfo: _store_backend_factory("local", location=unsupported_location) - expected_mesage = ("Instanciating a backend using a " + expected_mesage = ("Instantiating a backend using a " "NonSupportedLocationClass as a location is not " "supported by joblib") assert expected_mesage in str(warninfo[0].message) @@ -1152,7 +1168,7 @@ class NonSupportedLocationClass: def test_instanciate_incomplete_store_backend(): # Verify that registering an external incomplete store backend raises an - # exception when one tries to instanciate it. + # exception when one tries to instantiate it. 
backend_name = "isb" register_store_backend(backend_name, IncompleteStoreBackend) assert (backend_name, IncompleteStoreBackend) in _STORE_BACKENDS.items() @@ -1174,7 +1190,7 @@ def test_dummy_store_backend(): def test_instanciate_store_backend_with_pathlib_path(): - # Instanciate a FileSystemStoreBackend using a pathlib.Path object + # Instantiate a FileSystemStoreBackend using a pathlib.Path object path = pathlib.Path("some_folder") backend_obj = _store_backend_factory("local", path) assert backend_obj.location == "some_folder" @@ -1191,7 +1207,7 @@ def test_filesystem_store_backend_repr(tmpdir): assert str(backend) == repr_pattern.format(location=None) - # backend location is passed explicitely via the configure method (called + # backend location is passed explicitly via the configure method (called # by the internal _store_backend_factory function) backend.configure(tmpdir.strpath) diff --git a/joblib/test/test_missing_multiprocessing.py b/joblib/test/test_missing_multiprocessing.py new file mode 100644 index 000000000..251925ced --- /dev/null +++ b/joblib/test/test_missing_multiprocessing.py @@ -0,0 +1,32 @@ +""" +Pyodide and other single-threaded Python builds will be missing the +_multiprocessing module. Test that joblib still works in this environment. +""" + +import os +import subprocess +import sys + + +def test_missing_multiprocessing(tmp_path): + """ + Test that import joblib works even if _multiprocessing is missing. + + pytest has already imported everything from joblib. The most reasonable way + to test importing joblib with modified environment is to invoke a separate + Python process. This also ensures that we don't break other tests by + importing a bad `_multiprocessing` module. + """ + (tmp_path / "_multiprocessing.py").write_text( + 'raise ImportError("No _multiprocessing module!")' + ) + env = dict(os.environ) + # For subprocess, use current sys.path with our custom version of + # multiprocessing inserted. + env["PYTHONPATH"] = ":".join([str(tmp_path)] + sys.path) + subprocess.check_call( + [sys.executable, "-c", + "import joblib, math; " + "joblib.Parallel(n_jobs=1)(" + "joblib.delayed(math.sqrt)(i**2) for i in range(10))" + ], env=env) diff --git a/joblib/test/test_module.py b/joblib/test/test_module.py index 9c3b12b90..a2257a414 100644 --- a/joblib/test/test_module.py +++ b/joblib/test/test_module.py @@ -1,7 +1,7 @@ import sys import joblib -import pytest from joblib.testing import check_subprocess_call +from joblib.test.common import with_multiprocessing def test_version(): @@ -9,6 +9,7 @@ def test_version(): "There are no __version__ argument on the joblib module") +@with_multiprocessing def test_no_start_method_side_effect_on_import(): # check that importing joblib does not implicitly set the global # start_method for multiprocessing. 
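The new ``test_missing_multiprocessing.py`` above simulates a Pyodide-like interpreter by shadowing ``_multiprocessing`` on ``PYTHONPATH``. To see what actually got registered at import time, the module-level names introduced in the ``joblib/parallel.py`` hunk earlier can be inspected; they are internal details, so treat this only as a debugging aid:

import joblib.parallel as jp

# 'loky' and 'multiprocessing' are only registered when multiprocessing imports.
print(sorted(jp.BACKENDS))
# 'loky' on a regular CPython build, 'threading' on e.g. Pyodide.
print(jp.DEFAULT_BACKEND)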
@@ -22,6 +23,7 @@ def test_no_start_method_side_effect_on_import(): check_subprocess_call([sys.executable, '-c', code]) +@with_multiprocessing def test_no_semaphore_tracker_on_import(): # check that importing joblib does not implicitly spawn a resource tracker # or a semaphore tracker @@ -38,6 +40,7 @@ def test_no_semaphore_tracker_on_import(): check_subprocess_call([sys.executable, '-c', code]) +@with_multiprocessing def test_no_resource_tracker_on_import(): code = """if True: import joblib diff --git a/joblib/test/test_numpy_pickle.py b/joblib/test/test_numpy_pickle.py index db130b1f4..c9d1d5bdb 100644 --- a/joblib/test/test_numpy_pickle.py +++ b/joblib/test/test_numpy_pickle.py @@ -5,6 +5,7 @@ import random import re import io +import sys import warnings import gzip import zlib @@ -13,15 +14,18 @@ import socket from contextlib import closing import mmap +from pathlib import Path + try: import lzma except ImportError: lzma = None + import pytest from joblib.test.common import np, with_numpy, with_lz4, without_lz4 from joblib.test.common import with_memory_profiler, memory_used -from joblib.testing import parametrize, raises, SkipTest, warns +from joblib.testing import parametrize, raises, warns # numpy_pickle is not a drop-in replacement of pickle, as it takes # filenames instead of open files as arguments. @@ -30,6 +34,8 @@ from joblib.numpy_pickle_utils import _IO_BUFFER_SIZE from joblib.numpy_pickle_utils import _detect_compressor +from joblib.numpy_pickle_utils import _is_numpy_array_byte_order_mismatch +from joblib.numpy_pickle_utils import _ensure_native_byte_order from joblib.compressor import (_COMPRESSORS, _LZ4_PREFIX, CompressorWrapper, LZ4_NOT_INSTALLED_ERROR, BinaryZlibFile) @@ -146,21 +152,19 @@ def test_numpy_persistence(tmpdir, compress): # And finally, check that all the values are equal. np.testing.assert_array_equal(np.array(obj), np.array(obj_)) - # Now test with array subclasses - for obj in (np.matrix(np.zeros(10)), - np.memmap(filename + 'mmap', - mode='w+', shape=4, dtype=np.float)): - filenames = numpy_pickle.dump(obj, filename, compress=compress) - # All is cached in one file - assert len(filenames) == 1 + # Now test with an array subclass + obj = np.memmap(filename + 'mmap', mode='w+', shape=4, dtype=np.float64) + filenames = numpy_pickle.dump(obj, filename, compress=compress) + # All is cached in one file + assert len(filenames) == 1 - obj_ = numpy_pickle.load(filename) - if (type(obj) is not np.memmap and - hasattr(obj, '__array_prepare__')): - # We don't reconstruct memmaps - assert isinstance(obj_, type(obj)) + obj_ = numpy_pickle.load(filename) + if (type(obj) is not np.memmap and + hasattr(obj, '__array_prepare__')): + # We don't reconstruct memmaps + assert isinstance(obj_, type(obj)) - np.testing.assert_array_equal(obj_, obj) + np.testing.assert_array_equal(obj_, obj) # Test with an object containing multiple numpy arrays obj = ComplexTestObject() @@ -276,11 +280,13 @@ def test_compress_mmap_mode_warning(tmpdir): numpy_pickle.dump(a, this_filename, compress=1) with warns(UserWarning) as warninfo: numpy_pickle.load(this_filename, mmap_mode='r+') + warninfo = [w.message for w in warninfo] assert len(warninfo) == 1 - assert (str(warninfo[0].message) == - 'mmap_mode "%(mmap_mode)s" is not compatible with compressed ' - 'file %(filename)s. "%(mmap_mode)s" flag will be ignored.' % - {'filename': this_filename, 'mmap_mode': 'r+'}) + assert ( + str(warninfo[0]) == + 'mmap_mode "r+" is not compatible with compressed ' + f'file {this_filename}. 
"r+" flag will be ignored.' + ) @with_numpy @@ -292,7 +298,7 @@ def test_cache_size_warning(tmpdir, cache_size): a = rnd.random_sample((10, 2)) warnings.simplefilter("always") - with warns(None) as warninfo: + with warnings.catch_warnings(record=True) as warninfo: numpy_pickle.dump(a, filename, cache_size=cache_size) expected_nb_warnings = 1 if cache_size is not None else 0 assert len(warninfo) == expected_nb_warnings @@ -312,10 +318,8 @@ def test_memory_usage(tmpdir, compress): filename = tmpdir.join('test.pkl').strpath small_array = np.ones((10, 10)) big_array = np.ones(shape=100 * int(1e6), dtype=np.uint8) - small_matrix = np.matrix(small_array) - big_matrix = np.matrix(big_array) - for obj in (small_array, big_array, small_matrix, big_matrix): + for obj in (small_array, big_array): size = obj.nbytes / 1e6 obj_filename = filename + str(np.random.randint(0, 1000)) mem_used = memory_used(numpy_pickle.dump, @@ -341,11 +345,6 @@ def test_compressed_pickle_dump_and_load(tmpdir): np.arange(5, dtype=np.dtype('>f8')), np.array([1, 'abc', {'a': 1, 'b': 2}], dtype='O'), np.arange(256, dtype=np.uint8).tobytes(), - # np.matrix is a subclass of np.ndarray, here we want - # to verify this type of object is correctly unpickled - # among versions. - np.matrix([0, 1, 2], dtype=np.dtype('i8')), u"C'est l'\xe9t\xe9 !"] fname = tmpdir.join('temp.pkl.gz').strpath @@ -355,13 +354,14 @@ def test_compressed_pickle_dump_and_load(tmpdir): result_list = numpy_pickle.load(fname) for result, expected in zip(result_list, expected_list): if isinstance(expected, np.ndarray): + expected = _ensure_native_byte_order(expected) assert result.dtype == expected.dtype np.testing.assert_equal(result, expected) else: assert result == expected -def _check_pickle(filename, expected_list): +def _check_pickle(filename, expected_list, mmap_mode=None): """Helper function to test joblib pickle content. Note: currently only pickles containing an iterable are supported @@ -376,24 +376,44 @@ def _check_pickle(filename, expected_list): py_version_used_for_writing, 4) if pickle_reading_protocol >= pickle_writing_protocol: try: - with warns(None) as warninfo: + with warnings.catch_warnings(record=True) as warninfo: warnings.simplefilter('always') warnings.filterwarnings( 'ignore', module='numpy', message='The compiler package is deprecated') - result_list = numpy_pickle.load(filename) + result_list = numpy_pickle.load(filename, mmap_mode=mmap_mode) filename_base = os.path.basename(filename) - expected_nb_warnings = 1 if ("_0.9" in filename_base or - "_0.8.4" in filename_base) else 0 + expected_nb_deprecation_warnings = 1 if ( + "_0.9" in filename_base or "_0.8.4" in filename_base) else 0 + + expected_nb_user_warnings = 3 if ( + re.search("_0.1.+.pkl$", filename_base) and + mmap_mode is not None) else 0 + expected_nb_warnings = \ + expected_nb_deprecation_warnings + expected_nb_user_warnings assert len(warninfo) == expected_nb_warnings - for w in warninfo: - assert w.category == DeprecationWarning + + deprecation_warnings = [ + w for w in warninfo if issubclass( + w.category, DeprecationWarning)] + user_warnings = [ + w for w in warninfo if issubclass( + w.category, UserWarning)] + for w in deprecation_warnings: assert (str(w.message) == "The file '{0}' has been generated with a joblib " "version less than 0.10. 
Please regenerate this " "pickle file.".format(filename)) + + for w in user_warnings: + escaped_filename = re.escape(filename) + assert re.search( + f"memmapped.+{escaped_filename}.+segmentation fault", + str(w.message)) + for result, expected in zip(result_list, expected_list): if isinstance(expected, np.ndarray): + expected = _ensure_native_byte_order(expected) assert result.dtype == expected.dtype np.testing.assert_equal(result, expected) else: @@ -457,6 +477,68 @@ def test_joblib_pickle_across_python_versions(): _check_pickle(fname, expected_list) +@with_numpy +def test_joblib_pickle_across_python_versions_with_mmap(): + expected_list = [np.arange(5, dtype=np.dtype('i8'), ('', '>f8')]), + np.arange(3, dtype=np.dtype('>i8')), + np.arange(3, dtype=np.dtype('>f8'))] + + # Verify the byteorder mismatch is correctly detected. + for array in be_arrays: + if sys.byteorder == 'big': + assert not _is_numpy_array_byte_order_mismatch(array) + else: + assert _is_numpy_array_byte_order_mismatch(array) + converted = _ensure_native_byte_order(array) + if converted.dtype.fields: + for f in converted.dtype.fields.values(): + f[0].byteorder == '=' + else: + assert converted.dtype.byteorder == "=" + + # List of numpy arrays with little endian byteorder. + le_arrays = [np.array([(1, 2.0), (3, 4.0)], + dtype=[('', ' 0: - return 'backed parallel loops cannot' in records[0].message.args[0] + # with threading, we might see more that one warninfo + if warninfo: + return ( + len(warninfo) == 1 and + 'backed parallel loops cannot' in warninfo[0].args[0] + ) return False else: - assert len(records) == 0 + assert not warninfo return True @with_multiprocessing @parametrize('parent_backend,child_backend,expected', [ - ('loky', 'multiprocessing', True), ('loky', 'loky', False), + ('loky', 'multiprocessing', True), + ('loky', 'loky', False), ('multiprocessing', 'multiprocessing', True), ('multiprocessing', 'loky', True), ('threading', 'multiprocessing', True), @@ -241,11 +253,11 @@ def test_background_thread_parallelism(backend): is_run_parallel = [False] def background_thread(is_run_parallel): - with warns(None) as records: + with warnings.catch_warnings(record=True) as warninfo: Parallel(n_jobs=2)( delayed(sleep)(.1) for _ in range(4)) - print(len(records)) - is_run_parallel[0] = len(records) == 0 + print(len(warninfo)) + is_run_parallel[0] = len(warninfo) == 0 t = threading.Thread(target=background_thread, args=(is_run_parallel,)) t.start() @@ -269,6 +281,7 @@ def raise_exception(backend): raise ValueError +@with_multiprocessing def test_nested_loop_with_exception_with_loky(): with raises(ValueError): with Parallel(n_jobs=2, backend="loky") as parallel: @@ -568,8 +581,14 @@ def effective_n_jobs(self, n_jobs=1): def test_invalid_backend(): - with raises(ValueError): + with raises(ValueError) as excinfo: Parallel(backend='unit-testing') + assert "Invalid backend:" in str(excinfo.value) + + with raises(ValueError) as excinfo: + with parallel_backend('unit-testing'): + pass + assert "Invalid backend:" in str(excinfo.value) @parametrize('backend', ALL_VALID_BACKENDS) @@ -600,6 +619,17 @@ def test_overwrite_default_backend(): assert _active_backend_type() == DefaultBackend +@skipif(mp is not None, reason="Only without multiprocessing") +def test_backend_no_multiprocessing(): + with warns(UserWarning, + match="joblib backend '.*' is not available on.*"): + Parallel(backend='loky')(delayed(square)(i) for i in range(3)) + + # The below should now work without problems + with parallel_backend('loky'): + 
Parallel()(delayed(square)(i) for i in range(3)) + + def check_backend_context_manager(backend_name): with parallel_backend(backend_name, n_jobs=3): active_backend, active_n_jobs = parallel.get_active_backend() @@ -1001,6 +1031,7 @@ def test_parallel_with_unpicklable_functions_in_args( INTERACTIVE_DEFINED_FUNCTION_AND_CLASS_SCRIPT_CONTENT = """\ import sys +import faulthandler # Make sure that joblib is importable in the subprocess launching this # script. This is needed in case we run the tests from the joblib root # folder without having installed joblib @@ -1025,6 +1056,9 @@ def square(x, ignored=None, ignored2=None): # Here, we do not need the `if __name__ == "__main__":` safeguard when # using the default `loky` backend (even on Windows). +# To make debugging easier +faulthandler.dump_traceback_later(30, exit=True) + # The following baroque function call is meant to check that joblib # introspection rightfully uses cloudpickle instead of the (faster) pickle # module of the standard library when necessary. In particular cloudpickle is @@ -1047,9 +1081,11 @@ def test_parallel_with_interactively_defined_functions_default_backend(tmpdir): # filesystem script. script = tmpdir.join('joblib_interactively_defined_function.py') script.write(INTERACTIVE_DEFINED_FUNCTION_AND_CLASS_SCRIPT_CONTENT) - check_subprocess_call([sys.executable, script.strpath], - stdout_regex=r'\[0, 1, 4, 9, 16\]', - timeout=5) + check_subprocess_call( + [sys.executable, script.strpath], + stdout_regex=r'\[0, 1, 4, 9, 16\]', + timeout=None, # rely on faulthandler to kill the process + ) INTERACTIVELY_DEFINED_SUBCLASS_WITH_METHOD_SCRIPT_CONTENT = """\ @@ -1146,7 +1182,7 @@ def test_memmap_with_big_offset(tmpdir): def test_warning_about_timeout_not_supported_by_backend(): - with warns(None) as warninfo: + with warnings.catch_warnings(record=True) as warninfo: Parallel(timeout=1)(delayed(square)(i) for i in range(50)) assert len(warninfo) == 1 w = warninfo[0] @@ -1207,7 +1243,10 @@ def test_memmapping_leaks(backend, tmpdir): raise AssertionError('temporary directory of Parallel was not removed') -@parametrize('backend', [None, 'loky', 'threading']) +@parametrize('backend', + ([None, 'threading'] if mp is None + else [None, 'loky', 'threading']) + ) def test_lambda_expression(backend): # cloudpickle is used to pickle delayed callables results = Parallel(n_jobs=2, backend=backend)( @@ -1237,6 +1276,7 @@ def test_backend_batch_statistics_reset(backend): p._backend._DEFAULT_SMOOTHED_BATCH_DURATION) +@with_multiprocessing def test_backend_hinting_and_constraints(): for n_jobs in [1, 2, -1]: assert type(Parallel(n_jobs=n_jobs)._backend) == LokyBackend @@ -1347,12 +1387,13 @@ def test_invalid_backend_hinting_and_constraints(): # requiring shared memory semantics. Parallel(prefer='processes', require='sharedmem') - # It is inconsistent to ask explictly for a process-based parallelism - # while requiring shared memory semantics. - with raises(ValueError): - Parallel(backend='loky', require='sharedmem') - with raises(ValueError): - Parallel(backend='multiprocessing', require='sharedmem') + if mp is not None: + # It is inconsistent to ask explicitly for a process-based + # parallelism while requiring shared memory semantics. 
+ with raises(ValueError): + Parallel(backend='loky', require='sharedmem') + with raises(ValueError): + Parallel(backend='multiprocessing', require='sharedmem') def test_global_parallel_backend(): @@ -1437,7 +1478,8 @@ def _recursive_parallel(nesting_limit=None): return Parallel()(delayed(_recursive_parallel)() for i in range(2)) -@parametrize('backend', ['loky', 'threading']) +@parametrize('backend', + (['threading'] if mp is None else ['loky', 'threading'])) def test_thread_bomb_mitigation(backend): # Test that recursive parallelism raises a recursion rather than # saturating the operating system resources by creating a unbounded number @@ -1446,13 +1488,18 @@ def test_thread_bomb_mitigation(backend): with raises(BaseException) as excinfo: _recursive_parallel() exc = excinfo.value - if backend == "loky" and isinstance(exc, TerminatedWorkerError): - # The recursion exception can itself cause an error when pickling it to - # be send back to the parent process. In this case the worker crashes - # but the original traceback is still printed on stderr. This could be - # improved but does not seem simple to do and this is is not critical - # for users (as long as there is no process or thread bomb happening). - pytest.xfail("Loky worker crash when serializing RecursionError") + if backend == "loky": + # Local import because loky may not be importable for lack of + # multiprocessing + from joblib.externals.loky.process_executor import TerminatedWorkerError # noqa + if isinstance(exc, TerminatedWorkerError): + # The recursion exception can itself cause an error when + # pickling it to be send back to the parent process. In this + # case the worker crashes but the original traceback is still + # printed on stderr. This could be improved but does not seem + # simple to do and this is is not critical for users (as long + # as there is no process or thread bomb happening). + pytest.xfail("Loky worker crash when serializing RecursionError") else: assert isinstance(exc, RecursionError) @@ -1466,7 +1513,7 @@ def _run_parallel_sum(): return env_vars, parallel_sum(100) -@parametrize("backend", [None, 'loky']) +@parametrize("backend", ([None, 'loky'] if mp is not None else [None])) @skipif(parallel_sum is None, reason="Need OpenMP helper compiled") def test_parallel_thread_limit(backend): results = Parallel(n_jobs=2, backend=backend)( @@ -1563,7 +1610,7 @@ def _parent_max_num_threads_for(child_module, parent_info): def check_child_num_threads(workers_info, parent_info, num_threads): # Check that the number of threads reported in workers_info is consistent - # with the expectation. We need to be carefull to handle the cases where + # with the expectation. We need to be careful to handle the cases where # the requested number of threads is below max_num_thread for the library. 
for child_threadpool_info in workers_info: for child_module in child_threadpool_info: diff --git a/joblib/test/test_utils.py b/joblib/test/test_utils.py new file mode 100644 index 000000000..4999a212c --- /dev/null +++ b/joblib/test/test_utils.py @@ -0,0 +1,27 @@ +import pytest + +from joblib._utils import eval_expr + + +@pytest.mark.parametrize( + "expr", + ["exec('import os')", "print(1)", "import os", "1+1; import os", "1^1"], +) +def test_eval_expr_invalid(expr): + with pytest.raises( + ValueError, match="is not a valid or supported arithmetic" + ): + eval_expr(expr) + + +@pytest.mark.parametrize( + "expr, result", + [ + ("2*6", 12), + ("2**6", 64), + ("1 + 2*3**(4) / (6 + -7)", -161.0), + ("(20 // 3) % 5", 1), + ], +) +def test_eval_expr_valid(expr, result): + assert eval_expr(expr) == result diff --git a/joblib/testing.py b/joblib/testing.py index 28f79311c..f8939f056 100644 --- a/joblib/testing.py +++ b/joblib/testing.py @@ -50,9 +50,10 @@ def kill_process(): warnings.warn("Timeout running {}".format(cmd)) proc.kill() - timer = threading.Timer(timeout, kill_process) try: - timer.start() + if timeout is not None: + timer = threading.Timer(timeout, kill_process) + timer.start() stdout, stderr = proc.communicate() stdout, stderr = stdout.decode(), stderr.decode() if proc.returncode != 0: @@ -74,4 +75,5 @@ def kill_process(): stderr_regex, stderr)) finally: - timer.cancel() + if timeout is not None: + timer.cancel() diff --git a/setup.cfg b/setup.cfg index 6a31db944..e3dfb343e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,14 +4,13 @@ release = egg_info -RDb '' upload = upload upload_docs --upload-dir doc/_build/html [bdist_rpm] -doc-files = doc +doc_files = doc [tool:pytest] doctest_optionflags = NORMALIZE_WHITESPACE ELLIPSIS addopts = --doctest-glob="doc/*.rst" --doctest-modules - -p no:warnings --ignore joblib/externals testpaths = joblib diff --git a/setup.py b/setup.py index d2794c2d5..d9c642f34 100755 --- a/setup.py +++ b/setup.py @@ -15,6 +15,9 @@ author='Gael Varoquaux', author_email='gael.varoquaux@normalesup.org', url='https://joblib.readthedocs.io', + project_urls={ + 'Source': 'https://github.com/joblib/joblib', + }, license='BSD', description="Lightweight pipelining with Python functions", long_description=long_description, @@ -28,10 +31,10 @@ 'License :: OSI Approved :: BSD License', 'Operating System :: OS Independent', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', 'Topic :: Scientific/Engineering', 'Topic :: Utilities', 'Topic :: Software Development :: Libraries', @@ -54,5 +57,5 @@ 'joblib.externals', 'joblib.externals.cloudpickle', 'joblib.externals.loky', 'joblib.externals.loky.backend', ], - python_requires='>=3.6', + python_requires='>=3.7', )
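The new ``joblib/test/test_utils.py`` above exercises the ``eval_expr`` helper that replaces the old ``eval(pre_dispatch)`` call in ``joblib/parallel.py``. Its implementation lives in ``joblib/_utils.py`` and is not part of this diff, so the following is only an illustrative sketch of how such a restricted arithmetic evaluator can be built on the ``ast`` module (Python 3.8+ node types); it is not joblib's actual code:

import ast
import operator

_ALLOWED_OPS = {
    ast.Add: operator.add, ast.Sub: operator.sub, ast.Mult: operator.mul,
    ast.Div: operator.truediv, ast.FloorDiv: operator.floordiv,
    ast.Mod: operator.mod, ast.Pow: operator.pow, ast.USub: operator.neg,
}

def eval_expr_sketch(expr):
    """Evaluate a purely arithmetic expression such as '2*4' (joblib first
    substitutes the numeric value of n_jobs into the string, as shown in the
    parallel.py hunk above)."""
    try:
        return _eval_node(ast.parse(expr, mode="eval").body)
    except (KeyError, SyntaxError, TypeError) as e:
        raise ValueError(
            f"{expr!r} is not a valid or supported arithmetic expression."
        ) from e

def _eval_node(node):
    if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
        return node.value
    if isinstance(node, ast.BinOp):
        return _ALLOWED_OPS[type(node.op)](
            _eval_node(node.left), _eval_node(node.right))
    if isinstance(node, ast.UnaryOp):
        return _ALLOWED_OPS[type(node.op)](_eval_node(node.operand))
    # Anything else (names, calls, imports, ...) is rejected.
    raise TypeError(node)

assert eval_expr_sketch("2*6") == 12
assert eval_expr_sketch("1 + 2*3**(4) / (6 + -7)") == -161.0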