OMP_NUM_THREADS
MKL_NUM_THREADS
OPENBLAS_NUM_THREADS
OMP_NUM_THREADS
MKL_NUM_THREADS
OPENBLAS_NUM_THREADS
POLARS_MAX_THREADS
NUMBA_NUM_THREADS
VECLIB_MAXIMUM_THREADS
NUMEXPR_NUM_THREADS
torch.set_num_threads
numba.set_num_threads
threadpoolctl.threadpool_limits
cv.setNumThreads
threadpoolctl
from threadpoolctl import threadpool_limits
import numpy as np

with threadpool_limits(limits=2):
    a = np.random.randn(1000, 1000)
    a_squared = a @ a
n_jobs
workers
num_workers
max_workers
export OMP_NUM_THREADS=1
export MKL_NUM_THREADS=1
export OPENBLAS_NUM_THREADS=1
export POLARS_MAX_THREADS=1
export NUMEXPR_NUM_THREADS=1
export OMP_NUM_THREADS=1
export MKL_NUM_THREADS=1
export OPENBLAS_NUM_THREADS=1
export POLARS_MAX_THREADS=1
export NUMEXPR_NUM_THREADS=1
export OMP_NUM_THREADS=1
export OMP_NUM_THREADS=1
export MKL_NUM_THREADS=1
export OPENBLAS_NUM_THREADS=1
export POLARS_MAX_THREADS=1
export NUMEXPR_NUM_THREADS=1
export OMP_NUM_THREADS=1
export GOTO_NUM_THREADS=1
threadpoolctl
n_jobs
workers
num_workers
max_workers
n_jobs
workers
num_workers
max_workers
workers
from scipy import optimize

optimize.brute(
    computation_that_uses_8_cores,
    ...
    workers=8
)
from torch.utils.data import DataLoader

dl = DataLoader(..., num_workers=8)

# torch/utils/data/_utils/worker.py
def _worker_loop(...):
    ...
    torch.set_num_threads(1)
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier

clf = HistGradientBoostingClassifier()
search = HalvingGridSearchCV(
    clf, param_distributions, n_jobs=8
)
search.fit(X, y)
fork
+ GCC OpenMP: stalls
Python multiprocessing using fork
+ GCC OpenMP: stalls
Intel OpenMP + LLVM OpenMP on Linux: stalls
Python multiprocessing using fork
+ GCC OpenMP: stalls
Intel OpenMP + LLVM OpenMP on Linux: stalls
Multiple OpenBLAS libraries: sometimes slower
Python multiprocessing using fork
+ GCC OpenMP: stalls
Intel OpenMP + LLVM OpenMP on Linux: stalls
Multiple OpenBLAS libraries: sometimes slower
Read more at: thomasjpfan.github.io/parallelism-python-libraries-design/
Sources: polars, numba, scikit-learn, pandas
for n_iter in range(100):
    UV = U @ V.T             # Use OpenBLAS with pthreads
    compute_with_openmp(UV)  # Use OpenMP
import numpy as np

out = np.sum(A_array, axis=1)
import numpy as np

out = np.sum(A_array, axis=1)
🐢 One Core 🐢
import numpy as np

out = A_array @ B_array
import numpy as np

out = A_array @ B_array
🏎️ All Cores 🏎️
OMP_NUM_THREADS
out = A_array @ B_array
OMP_NUM_THREADS
out = A_array @ B_array
threadpoolctl
from threadpoolctl import threadpool_limits

with threadpool_limits(limits=1):
    out = A_array @ B_array
import torch

out = torch.sum(A_tensor, axis=1)
import torch

out = torch.sum(A_tensor, axis=1)
🏎️ All Cores 🏎️
OMP_NUM_THREADS
threadpoolctl
with threadpool_limits(limits=2):
    out = torch.sum(A_tensor, axis=1)
OMP_NUM_THREADS
threadpoolctl
with threadpool_limits(limits=2):
    out = torch.sum(A_tensor, axis=1)
import torch

torch.set_num_threads(2)
out = torch.sum(A_tensor, axis=1)
import pandas as pd

df = pd.DataFrame(np.random.randn(10_000, 100))
roll = df.rolling(100)
out = roll.mean()
import pandas as pd

df = pd.DataFrame(np.random.randn(10_000, 100))
roll = df.rolling(100)
out = roll.mean()
🐢 One Core 🐢
import pandas as pd

df = pd.DataFrame(np.random.randn(10_000, 100))
roll = df.rolling(100)
out = roll.mean(
    engine="numba",
    engine_kwargs={"parallel": True},
)
import pandas as pd

df = pd.DataFrame(np.random.randn(10_000, 100))
roll = df.rolling(100)
out = roll.mean(
    engine="numba",
    engine_kwargs={"parallel": True},
)
🏎️ All Cores 🏎️
NUMBA_NUM_THREADS
Environment variable: NUMBA_NUM_THREADS
Numba function
import numba

numba.set_num_threads(2)
out = roll.mean(engine="numba", engine_kwargs={"parallel": True})
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression().fit(...)
log_reg.predict(X)
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression().fit(...)
log_reg.predict(X)
🏎️ All Cores 🏎️
OMP_NUM_THREADS
Environment variable: OMP_NUM_THREADS
threadpoolctl
with threadpool_limits(limits=2):
    log_reg.predict(X)
from sklearn.ensemble import HistGradientBoostingClassifier

hist = HistGradientBoostingClassifier()
hist.fit(X, y)
from sklearn.ensemble import HistGradientBoostingClassifier

hist = HistGradientBoostingClassifier()
hist.fit(X, y)
🏎️ All Cores 🏎️
OMP_NUM_THREADS
Environment variable: OMP_NUM_THREADS
threadpoolctl
with threadpool_limits(limits=2):
    hist.predict(X)
out = (
    pl.scan_csv(...)
    .filter(pl.col("sepal_length") > 5)
    .groupby("species")
    .agg(pl.col("sepal_width").mean())
    .collect()
)
out = (
    pl.scan_csv(...)
    .filter(pl.col("sepal_length") > 5)
    .groupby("species")
    .agg(pl.col("sepal_width").mean())
    .collect()
)
🏎️ All Cores 🏎️
POLARS_MAX_THREADS
out = (
    pl.scan_csv(...)
    .filter(pl.col("sepal_length") > 5)
    ...
)
Keyboard shortcuts
←, ↑, Pg Up, k | Go to previous slide |
→, ↓, Pg Dn, Space, j | Go to next slide |
Home | Go to first slide |
End | Go to last slide |
Number + Return | Go to specific slide |
b / m / f | Toggle blackout / mirrored / fullscreen mode |
c | Clone slideshow |
p | Toggle presenter mode |
t | Restart the presentation timer |
?, h | Toggle this help |
Esc | Back to slideshow |