Commit dc2ee0d

Add support for multihost pmaps.
All participating hosts are assumed to be running the same pmap code. Conceptually, this can be considered a single pmap over an array sharded on its leading pmapped dimension across the hosts. Each host passes its input shard to its pmapped function call, which returns the corresponding output shard (i.e. an array of the same leading dimension size). However, any collective operations will be run across the entire "global" array.

If the `devices` argument to pmap is None, the pmap is assumed to be running across all hosts visible to XLA (as returned by jax.host_count()). Each host can pass in an input array of leading dimension size equal to or less than the number of devices local to that host. Note that this doesn't change the current behavior for single-host platforms. If `devices` are specified, the participating hosts are dictated by the devices' host_ids, and each host must pass in an input array of leading dim size equal to the number of local participating devices.

Implementation-wise, each host independently compiles the computation, which we assume yields the same executable on all hosts (follow-up work will add more error checking). The hosts must know the global axis size of the sharded array, e.g. to provide the correct replica count to XLA. This is equal to the length of `devices` if specified, but if not, pmap is recursively called (with `devices` specified) to use `psum` to compute the global axis size.
1 parent e7b09cf commit dc2ee0d
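The sharding scheme described in the commit message can be sketched in plain Python (no JAX and no real hosts involved; `local_shard` is a hypothetical helper for illustration, not a JAX API):

```python
# Hypothetical helper (not part of JAX): which slice of the conceptual
# "global" array a given host passes to, and receives from, the pmap.
def local_shard(global_array, host_id, local_device_count):
    start = host_id * local_device_count
    return global_array[start:start + local_device_count]

global_input = list(range(8))           # global array sharded across 2 hosts
print(local_shard(global_input, 0, 4))  # host 0's shard: [0, 1, 2, 3]
print(local_shard(global_input, 1, 4))  # host 1's shard: [4, 5, 6, 7]
```

Each host only ever materializes its own shard; the "global" array exists only conceptually, as the concatenation of the shards along the leading dimension.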

File tree

3 files changed: +83 −27 lines changed


jax/api.py

Lines changed: 44 additions & 13 deletions
@@ -51,7 +51,7 @@
 from .util import (unzip2, unzip3, curry, partial, safe_map, safe_zip,
                    WrapHashably, Hashable, prod, split_list)
 from .lib.xla_bridge import (canonicalize_dtype, device_count,
-                             local_device_count, devices, host_id)
+                             local_device_count, devices, host_id, host_count)
 from .abstract_arrays import ShapedArray, raise_to_shaped
 from .interpreters import partial_eval as pe
 from .interpreters import xla
@@ -643,23 +643,36 @@ def pmap(fun, axis_name=None, devices=None, backend=None):
   pure maps, ``pmap`` enables the use of parallel SPMD collective operations,
   like all-reduce sum.
-  The mapped axis size must be less than or equal to the number of XLA devices
-  available (unless ``devices`` is specified, see below). For nested ``pmap``
-  calls, the product of the mapped axis sizes must be less than or equal to the
-  number of XLA devices.
+  The mapped axis size must be less than or equal to the number of local XLA
+  devices available, as returned by ``jax.local_device_count()`` (unless
+  ``devices`` is specified, see below). For nested ``pmap`` calls, the product
+  of the mapped axis sizes must be less than or equal to the number of XLA
+  devices. TODO(skye): support < # local devices on multi-host platforms
+
+  **Multi-host platforms:** On multi-host platforms such as TPU pods, ``pmap``
+  is designed to be used in SPMD Python programs, where every host is running
+  the same Python code such that all hosts run the same pmapped function in the
+  same order. Each host should still call the pmapped function with mapped axis
+  size equal to the number of *local* devices (unless ``devices`` is specified,
+  see below), and an array of the same leading axis size will be returned as
+  usual. However, any collective operations in ``fun`` will be computed over
+  *all* participating devices, including those on other hosts, via
+  device-to-device communication. Conceptually, this can be thought of as
+  running a pmap over a single array sharded across hosts, where each host
+  "sees" only its local shard of the input and output.
 
   Args:
     fun: Function to be mapped over argument axes.
     axis_name: Optional, a hashable Python object used to identify the mapped
       axis so that parallel collectives can be applied.
     devices: This is an experimental feature and the API is likely to change.
       Optional, a sequence of Devices to map over. (Available devices can be
-      retrieved via jax.devices()). If specified, the length of the sequence
-      must be equal to the size of the mapped axis. Nested ``pmap``s with
-      ``devices`` specified in either the inner or outer ``pmap`` are not yet
-      supported.
+      retrieved via jax.devices()). If specified, the size of the mapped axis
+      must be equal to the number of local devices in the sequence. Nested
+      ``pmap`` s with ``devices`` specified in either the inner or outer
+      ``pmap`` are not yet supported.
     backend: This is an experimental feature and the API is likely to change.
-      Optional, a string representing the xla backend. 'cpu','gpu', or 'tpu'.
+      Optional, a string representing the xla backend: 'cpu', 'gpu', or 'tpu'.
 
   Returns:
     A parallelized version of ``fun`` with arguments that correspond to those of
@@ -721,10 +734,28 @@ def pmap(fun, axis_name=None, devices=None, backend=None):
   >>> print(doubly_normed.sum((0, 1)))
   1.0
 
+  On multi-host platforms, collective operations operate over all devices,
+  including those on other hosts. For example, assuming the following code runs
+  on two hosts with 4 XLA devices each:
+
+  >>> f = lambda x: x + jax.lax.psum(x, axis_name='i')
+  >>> data = np.arange(4) if jax.host_id() == 0 else np.arange(4, 8)
+  >>> out = pmap(f, axis_name='i')(data)
+  >>> print(out)
+  [28 29 30 31] # on host 0
+  [32 33 34 35] # on host 1
+
+  Each host passes in a different length-4 array, corresponding to its 4 local
+  devices, and the psum operates over all 8 values. Conceptually, the two
+  length-4 arrays can be thought of as a sharded length-8 array (in this
+  example equivalent to np.arange(8)) that is mapped over, with the length-8
+  mapped axis given name 'i'. The pmap call on each host then returns the
+  corresponding length-4 output shard.
+
   The ``devices`` argument can be used to specify exactly which devices are used
-  to run the parallel computation. For example, the following code defines
-  two parallel computations, one which runs on the first six devices and one on
-  the remaining two:
+  to run the parallel computation. For example, again assuming a single host
+  with 8 devices, the following code defines two parallel computations, one
+  which runs on the first six devices and one on the remaining two:
 
   >>> from functools import partial
   >>> @partial(pmap, axis_name='i', devices=jax.devices()[:6])
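The two-host psum example in the docstring above can be checked with a small pure-Python simulation (no JAX and no real devices required; `simulated_multihost_pmap` is a made-up illustrative name, not a JAX API):

```python
# Pure-Python stand-in for the two-host example: collectives see the
# concatenation of every host's shard, i.e. the conceptual global array.
def simulated_multihost_pmap(fun, shards):
    global_array = [x for shard in shards for x in shard]
    psum = sum(global_array)            # the collective spans all hosts
    return [[fun(x, psum) for x in shard] for shard in shards]

f = lambda x, psum_x: x + psum_x        # mirrors x + jax.lax.psum(x, 'i')
out0, out1 = simulated_multihost_pmap(f, [list(range(4)), list(range(4, 8))])
print(out0)  # [28, 29, 30, 31]  (host 0)
print(out1)  # [32, 33, 34, 35]  (host 1)
```

The sum over the full length-8 array is 28, so each element gains 28, reproducing the per-host outputs shown in the docstring example.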

jax/interpreters/pxla.py

Lines changed: 35 additions & 14 deletions
@@ -22,6 +22,7 @@
 import operator as op
 import threading
 
+from absl import logging
 import numpy as onp
 import six
 from six.moves import reduce
@@ -172,32 +173,37 @@ def replica_groups(nrep, mesh_spec, mesh_axes):
 
 ### the main pmap machinery lowers SPMD jaxprs to multi-replica XLA computations
 
-def compile_replicated(jaxpr, backend, axis_name, axis_size, devices, consts,
-                       tuple_args, *abstract_args):
+def compile_replicated(jaxpr, backend, axis_name, axis_size, global_axis_size,
+                       devices, consts, tuple_args, *abstract_args):
+  jaxpr_replicas = xla.jaxpr_replicas(jaxpr)
+  num_local_replicas = axis_size * jaxpr_replicas
+  num_replicas = global_axis_size * jaxpr_replicas
+  logging.vlog(
+      1, "compile_replicated: axis_size=%d global_axis_size=%d jaxpr_replicas=%d"
+      % (axis_size, global_axis_size, jaxpr_replicas))
+
   if devices is None:
-    num_replicas = axis_size * xla.jaxpr_replicas(jaxpr)
     if num_replicas > xb.device_count(backend):
       msg = ("compiling computation that requires {} replicas, but only {} XLA "
              "devices are available")
       raise ValueError(msg.format(num_replicas, xb.device_count(backend)))
     device_assignment = None
   else:
-    assert all(d.host_id == xb.host_id() for d in devices)
-    if axis_size != len(devices):
+    assert any(d.host_id == xb.host_id() for d in devices)
+    if num_replicas != len(devices):
       raise ValueError("compiling computation that requires %s replicas, "
                        "but %s devices were specified"
-                       % (axis_size, len(devices)))
-    num_replicas = len(devices)
+                       % (num_replicas, len(devices)))
     device_assignment = tuple(d.id for d in devices)
 
-  axis_env = xla.AxisEnv(num_replicas, [axis_name], [axis_size], devices)
+  axis_env = xla.AxisEnv(num_replicas, [axis_name], [global_axis_size], devices)
   arg_shapes = list(map(aval_to_xla_shape, abstract_args))
   built_c = xla.jaxpr_computation(jaxpr, backend, axis_env, consts, (), arg_shapes,
                                   tuple_args=tuple_args)
   compiled = built_c.Compile(
       compile_options=xb.get_compile_options(num_replicas, device_assignment),
       backend=xb.get_backend(backend))
-  return compiled, num_replicas
+  return compiled, num_local_replicas
@@ -446,9 +452,24 @@ def parallel_callable(fun, backend, axis_name, axis_size, devices, *avals):
   pvals = [PartialVal((aval, core.unit)) for aval in avals]
   pval = PartialVal([core.abstract_unit, core.unit])  # dummy value
 
+  if devices:
+    global_axis_size = len(devices)
+  elif xb.host_count() > 1:
+    # TODO(skye): relax this constraint or provide functionality for
+    # automatically passing appropriate `devices`.
+    if axis_size != xb.local_device_count():
+      raise ValueError(
+          "On multi-host platforms, the input to pmapped functions must have "
+          "leading axis size equal to the number of local devices if no "
+          "`devices` argument is specified. Got axis_size=%d, "
+          "num_local_devices=%d" % (axis_size, xb.local_device_count()))
+    global_axis_size = xb.device_count()
+  else:
+    global_axis_size = axis_size
+
   @lu.wrap_init
   def dynamic_fun(dummy, *args):
-    with extend_dynamic_axis_env(axis_name, dummy.trace, axis_size):
+    with extend_dynamic_axis_env(axis_name, dummy.trace, global_axis_size):
       return fun.call_wrapped(*args)
 
   with core.new_master(JaxprTrace, True) as master:
@@ -470,7 +491,8 @@ def dynamic_fun(dummy, *args):
   # Condense many arguments into single tuple argument to avoid a TPU issue.
   tuple_args = len(avals) > 100
   compiled, nrep = compile_replicated(jaxpr, backend, axis_name, axis_size,
-                                      devices, consts, tuple_args, *avals)
+                                      global_axis_size, devices, consts,
+                                      tuple_args, *avals)
   device_ordinals = compiled.DeviceOrdinals()
   assignments = assign_shards_to_replicas(nrep, axis_size)
   handle_args = partial(shard_args, backend, device_ordinals, assignments,
@@ -520,7 +542,7 @@ def _pmap_translation_rule(c, jaxpr, axis_env, const_nodes, freevar_nodes,
   if axis_env.devices is not None or (axis_env.names and devices is not None):
     raise ValueError("Nested pmaps with explicit devices argument.")
   new_env = xla.extend_axis_env(axis_env, axis_name, axis_size)
-  in_nodes_sharded = list(map(partial(_xla_shard, c, new_env.sizes), in_nodes))
+  in_nodes_sharded = list(map(partial(_xla_shard, c), in_nodes))
   sharded_outs = xla.jaxpr_subcomp(c, jaxpr, backend, new_env, const_nodes,
                                    freevar_nodes, *in_nodes_sharded)
   outs = [_xla_unshard(c, xla.axis_groups(new_env, axis_name), r)
@@ -531,14 +553,13 @@
 ad.primitive_transposes[xla_pmap_p] = partial(ad.map_transpose, xla_pmap_p)
 pe.map_primitives.add(xla_pmap_p)
 
-def _xla_shard(c, sizes, x):
+def _xla_shard(c, x):
   xla_shape = c.GetShape(x)
   if xla_shape.is_tuple():
     assert not xla_shape.tuple_shapes()
     return x
   else:
     dims = list(xla_shape.dimensions())
-    assert dims[0] == sizes[-1]
     start_indices = _xla_shard_start_indices(c, dims[0], len(dims))
     return c.Reshape(c.DynamicSlice(x, start_indices, [1] + dims[1:]),
                      None, dims[1:])
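The global-axis-size selection added to `parallel_callable` (and the replica counts derived from it in `compile_replicated`) can be sketched as a standalone function; here the host and device counts are passed in explicitly rather than queried from `xla_bridge`, as an illustrative simplification:

```python
# Standalone sketch of the new global_axis_size logic (counts passed in
# explicitly instead of being read from xla_bridge).
def choose_global_axis_size(axis_size, devices, host_count,
                            local_device_count, device_count):
    if devices:
        return len(devices)            # `devices` fixes the global size
    elif host_count > 1:
        # Without `devices`, each host must map over all its local devices.
        if axis_size != local_device_count:
            raise ValueError("axis_size=%d, num_local_devices=%d"
                             % (axis_size, local_device_count))
        return device_count            # every device visible to XLA
    else:
        return axis_size               # single host: local == global

# Two hosts with 4 devices each, no `devices` argument:
g = choose_global_axis_size(4, None, 2, 4, 8)
# compile_replicated then derives both replica counts from the jaxpr factor:
jaxpr_replicas = 1                     # extra replicas required by the jaxpr
num_local_replicas = 4 * jaxpr_replicas
num_replicas = g * jaxpr_replicas
print(num_local_replicas, num_replicas)  # 4 8
```

This mirrors why `compile_replicated` now returns `num_local_replicas` rather than `num_replicas`: the caller shards arguments only across the replicas driven by this host.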

jax/lib/xla_bridge.py

Lines changed: 4 additions & 0 deletions
@@ -197,6 +197,10 @@ def host_id(backend=None):
   return get_backend(backend).host_id()
 
 
+def host_count():
+  return len(set(d.host_id for d in devices()))
+
+
 ### utility functions
 
 @util.memoize
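The new `host_count()` simply counts distinct `host_id`s among the visible devices. A self-contained sketch of the same logic using mock device objects (the `Device` namedtuple here is a stand-in, not JAX's device class):

```python
# Stand-in sketch of host_count(): count distinct host_ids among the devices.
from collections import namedtuple

Device = namedtuple("Device", ["id", "host_id"])
mock_devices = [Device(i, i // 4) for i in range(8)]  # 2 hosts x 4 devices

def host_count(devices):
    return len(set(d.host_id for d in devices))

print(host_count(mock_devices))  # 2
```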
