ludwig-ai · tgaddair · Oct 18, 2020 · Oct 22, 2020 · Oct 22, 2020 · Oct 22, 2020
@@ -37,7 +37,7 @@
 from ludwig.backend import LOCAL_BACKEND, Backend, create_backend
 from ludwig.constants import FULL, PREPROCESSING, TEST, TRAINING, VALIDATION
 from ludwig.contrib import contrib_command
-from ludwig.data.dataset import Dataset
+from ludwig.data.dataset.base import Dataset
 from ludwig.data.postprocessing import convert_predictions, postprocess
 from ludwig.data.preprocessing import (load_metadata,
                                        preprocess_for_prediction,

@@ -25,7 +25,13 @@ def get_local_backend():
     return LOCAL_BACKEND
 
 
+def create_dask_backend():
+    """Build and return a new ``DaskBackend`` instance.
+
+    ``DaskBackend`` is imported inside the function rather than at module
+    level, so ``ludwig.backend.dask`` is only loaded when this factory is
+    actually called (presumably to keep Dask an optional dependency --
+    TODO confirm). Every call constructs a fresh ``DaskBackend``.
+    """
+    from ludwig.backend.dask import DaskBackend
+    return DaskBackend()
+
+
+# Maps a backend name to a zero-argument factory that returns the backend.
+# The ``None`` key shares the 'local' factory (presumably the default used
+# when no backend name is given -- TODO confirm against the lookup code).
 backend_registry = {
+    'dask': create_dask_backend,
     'local': get_local_backend,
     None: get_local_backend,
 }

@@ -0,0 +1,38 @@
+#! /usr/bin/env python
+# coding=utf-8
+# Copyright (c) 2020 Uber Technologies, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from ludwig.backend.base import Backend
+from ludwig.constants import NAME
+from ludwig.data.dataframe.dask import DaskEngine
+
+
+class DaskBackend(Backend):
+    """Backend whose dataframe engine is a ``DaskEngine``."""
+
+    def __init__(self):
+        super().__init__()
+        # One engine per backend instance, exposed through ``df_engine``.
+        self._df_engine = DaskEngine()
+
+    @property
+    def df_engine(self):
+        """Return the ``DaskEngine`` created in ``__init__``."""
+        return self._df_engine
+
+    @property
+    def supports_multiprocessing(self):
+        """Always ``False`` for this backend."""
+        return False
+
+    def check_lazy_load_supported(self, feature):
+        """Reject lazy loading for ``feature`` unconditionally.
+
+        :param feature: feature definition, indexed with ``NAME`` to build
+            the error message.
+        :raises ValueError: always; the message tells the user to set
+            ``in_memory: True`` for the named feature.
+        """
+        feature_name = feature[NAME]
+        raise ValueError(
+            'DaskBackend does not support lazy loading of data files at train time. '
+            f'Set preprocessing config `in_memory: True` for feature {feature_name}'
+        )
@@ -103,3 +103,4 @@
 PROC_COLUMN = 'proc_column'
 
 CHECKSUM = 'checksum'
+RESHAPE = 'reshape'
@@ -0,0 +1,32 @@
+#! /usr/bin/env python
+# coding=utf-8
+# Copyright (c) 2020 Uber Technologies, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from abc import ABC, abstractmethod
+
+
+class Batcher(ABC):
+    """Abstract interface for objects that hand out batches of a dataset.
+
+    Subclasses must override all three methods; the bodies here only raise
+    ``NotImplementedError``.
+    """
+
+    @abstractmethod
+    def next_batch(self):
+        """Return the next batch of the current epoch."""
+        raise NotImplementedError()
+
+    @abstractmethod
+    def last_batch(self):
+        """Return whether the current epoch has no further batch to serve."""
+        raise NotImplementedError()
+
+    @abstractmethod
+    def set_epoch(self, epoch):
+        """Reset the per-epoch state so that epoch ``epoch`` can begin.
+
+        :param epoch: number of the epoch that is about to start.
+        """
+        raise NotImplementedError()
@@ -14,65 +14,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-import math
-
 import numpy as np
 
-from ludwig.data.sampler import DistributedSampler
-
-
-class Batcher(object):
-    def __init__(self, dataset, sampler,
-                 batch_size=128,
-                 ignore_last=False):
-        # store our dataset as well
-        self.dataset = dataset
-        self.sampler = sampler
-        self.sample_it = iter(self.sampler)
-
-        self.ignore_last = ignore_last
-        self.batch_size = batch_size
-        self.total_size = len(sampler)
-        self.steps_per_epoch = int(
-            math.ceil(self.total_size / self.batch_size))
-        self.index = 0
-        self.step = 0
-
-    def next_batch(self):
-        if self.last_batch():
-            raise StopIteration()
-
-        indices = []
-        for _ in range(self.batch_size):
-            try:
-                indices.append(next(self.sample_it))
-                self.index += 1
-            except StopIteration:
-                break
-
-        sub_batch = {}
-        for proc_column in self.dataset.features:
-            sub_batch[proc_column] = self.dataset.get(
-                proc_column,
-                indices
-            )
+from ludwig.data.batcher.base import Batcher
 
-        self.step += 1
-        return sub_batch
-
-    def last_batch(self):
-        return self.index >= self.total_size or (
-                self.ignore_last and
-                self.index + self.batch_size >= self.total_size)
 
-    def set_epoch(self, epoch):
-        self.index = 0
-        self.step = 0
-        self.sampler.set_epoch(epoch)
-        self.sample_it = iter(self.sampler)
-
-
-class BucketedBatcher(object):
+class BucketedBatcher(Batcher):
     def __init__(self, dataset, bucketing_field, batch_size=128, buckets=10,
                  should_shuffle=True, ignore_last=False,
                  should_trim=False, trim_side='right'):
@@ -117,8 +64,7 @@ def next_batch(self):
         if self.last_batch():
             if self.should_shuffle:
                 self.shuffle(self.buckets_idcs)
-            self.reset()
-            self.epoch += 1
+            self.set_epoch(self.epoch + 1)
 
         if self.ignore_last:
             idcs_below_size = self.indices + self.batch_size < self.bucket_sizes
@@ -157,9 +103,10 @@ def last_batch(self):
                        self.indices + self.batch_size < self.bucket_sizes
                    ))
 
-    def reset(self):
+    def set_epoch(self, epoch):
         self.indices = np.array([0] * len(self.buckets_idcs))
         self.step = 0
+        self.epoch = epoch
 
 
 # todo future: reintroduce the bucketed batcher
@@ -213,18 +160,3 @@ def reset(self):
 #             ignore_last=ignore_last
 #         )
 #     return batcher
-
-def initialize_batcher(dataset, batch_size=128,
-                       should_shuffle=True,
-                       seed=0,
-                       ignore_last=False,
-                       horovod=None):
-    sampler = DistributedSampler(len(dataset),
-                                 shuffle=should_shuffle,
-                                 seed=seed,
-                                 horovod=horovod)
-    batcher = Batcher(dataset,
-                      sampler,
-                      batch_size=batch_size,
-                      ignore_last=ignore_last)
-    return batcher
@@ -0,0 +1,61 @@
+#! /usr/bin/env python
+# coding=utf-8
+# Copyright (c) 2020 Uber Technologies, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from ludwig.data.batcher.base import Batcher
+
+
+class IterableBatcher(Batcher):
+    """Batcher that draws ready-made batches from an iterable dataset.
+
+    The iterable is optionally shuffled, then batched, and a single iterator
+    over the result is kept for the lifetime of the batcher. An epoch is
+    defined purely by counting ``steps_per_epoch`` calls to ``next_batch``.
+    """
+
+    def __init__(self,
+                 dataset,
+                 iterable_dataset,
+                 batch_size,
+                 steps_per_epoch,
+                 shuffle_buffer_size,
+                 ignore_last=False):
+        """Set up the batch iterator and the step counter.
+
+        :param dataset: object providing ``features`` and ``get(name, batch)``.
+        :param iterable_dataset: object providing ``shuffle(n)`` and
+            ``batch(n)``, whose batched form is iterable.
+        :param batch_size: value forwarded to ``iterable_dataset.batch``.
+        :param steps_per_epoch: number of steps that make up one epoch.
+        :param shuffle_buffer_size: forwarded to ``shuffle`` when positive;
+            shuffling is skipped otherwise.
+        :param ignore_last: when true, the final step of an epoch is never
+            served (see ``last_batch``).
+        """
+        self.dataset = dataset
+        self.ignore_last = ignore_last
+        self.steps_per_epoch = steps_per_epoch
+        self.step = 0
+
+        # Shuffle first (if requested) so that batches are cut from the
+        # shuffled stream rather than shuffling whole batches.
+        source = iterable_dataset
+        if shuffle_buffer_size > 0:
+            source = source.shuffle(shuffle_buffer_size)
+        self.data_it = iter(source.batch(batch_size))
+
+    def next_batch(self):
+        """Return a dict mapping each dataset feature to its data for one batch.
+
+        :raises StopIteration: when ``last_batch()`` is already true, or when
+            the underlying iterator itself is exhausted.
+        """
+        if self.last_batch():
+            raise StopIteration()
+
+        batch = next(self.data_it)
+        sub_batch = {
+            features_name: self.dataset.get(features_name, batch)
+            for features_name in self.dataset.features
+        }
+
+        self.step += 1
+        return sub_batch
+
+    def last_batch(self):
+        """Return a truthy value once the current epoch is over.
+
+        With ``ignore_last`` set, the epoch ends one step early, so the final
+        step is never served (and nothing is served at all when
+        ``steps_per_epoch`` is 1 or less).
+        """
+        exhausted = self.step >= self.steps_per_epoch
+        if exhausted:
+            return exhausted
+        return self.ignore_last and self.step + 1 >= self.steps_per_epoch
+
+    def set_epoch(self, epoch):
+        """Start a new epoch by resetting the step counter.
+
+        ``epoch`` is not used, and the data iterator is not recreated.
+        NOTE(review): this presumably relies on the iterable dataset yielding
+        data across epoch boundaries -- confirm with the dataset that builds it.
+        """
+        self.step = 0
@@ -0,0 +1,70 @@
+#! /usr/bin/env python
+# coding=utf-8
+# Copyright (c) 2019 Uber Technologies, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import math
+
+from ludwig.data.batcher.base import Batcher
+
+
+class RandomAccessBatcher(Batcher):
+    """Batcher that gathers rows by index, in the order given by a sampler."""
+
+    def __init__(self, dataset, sampler,
+                 batch_size=128,
+                 ignore_last=False):
+        """Bind the dataset and sampler and reset the epoch counters.
+
+        :param dataset: object providing ``features`` and
+            ``get(name, indices)``.
+        :param sampler: sized iterable of row indices that also provides
+            ``set_epoch(epoch)``.
+        :param batch_size: maximum number of indices per batch.
+        :param ignore_last: when true, the epoch stops early
+            (see ``last_batch``).
+        """
+        self.dataset = dataset
+        self.sampler = sampler
+        self.sample_it = iter(self.sampler)
+
+        self.batch_size = batch_size
+        self.ignore_last = ignore_last
+
+        self.total_size = len(sampler)
+        # Rounded up: a trailing partial batch counts as a step.
+        steps = math.ceil(self.total_size / self.batch_size)
+        self.steps_per_epoch = int(steps)
+
+        self.index = 0
+        self.step = 0
+
+    def next_batch(self):
+        """Return a dict mapping each dataset feature to the rows of one batch.
+
+        Up to ``batch_size`` indices are drawn from the sampler; fewer are
+        used when the sampler runs out first.
+
+        :raises StopIteration: when ``last_batch()`` is already true.
+        """
+        if self.last_batch():
+            raise StopIteration()
+
+        indices = []
+        for _ in range(self.batch_size):
+            try:
+                sample_idx = next(self.sample_it)
+            except StopIteration:
+                # Sampler exhausted: serve the (possibly short) batch.
+                break
+            indices.append(sample_idx)
+            self.index += 1
+
+        sub_batch = {
+            proc_column: self.dataset.get(proc_column, indices)
+            for proc_column in self.dataset.features
+        }
+
+        self.step += 1
+        return sub_batch
+
+    def last_batch(self):
+        """Return a truthy value once the current epoch is over.
+
+        With ``ignore_last`` set, the epoch ends as soon as the next batch
+        would reach the end of the data. Because the comparison is ``>=``,
+        this also drops a final batch that would be exactly full.
+        """
+        consumed_all = self.index >= self.total_size
+        if consumed_all:
+            return consumed_all
+        return (self.ignore_last and
+                self.index + self.batch_size >= self.total_size)
+
+    def set_epoch(self, epoch):
+        """Reset the counters and restart the sampler for ``epoch``.
+
+        :param epoch: value forwarded to ``sampler.set_epoch``.
+        """
+        self.index = self.step = 0
+        self.sampler.set_epoch(epoch)
+        # A fresh iterator is needed: the previous one may be exhausted.
+        self.sample_it = iter(self.sampler)
Original file line number	Diff line number	Diff line change
@@ -103,3 +103,4 @@
 PROC_COLUMN = 'proc_column'
 
 CHECKSUM = 'checksum'
+RESHAPE = 'reshape'