ludwig-ai · geoffreyangus · Feb 13, 2023 · Feb 10, 2023 · Feb 10, 2023 · Feb 10, 2023
@@ -854,8 +854,8 @@ def __init__(self, input_feature_config: ImageInputFeatureConfig, encoder_obj=No
             )
 
     def forward(self, inputs: torch.Tensor) -> torch.Tensor:
-        assert isinstance(inputs, torch.Tensor)
-        assert inputs.dtype in [torch.float32]
+        assert isinstance(inputs, torch.Tensor), f"inputs to image feature must be a torch tensor, got {type(inputs)}"
+        assert inputs.dtype in [torch.float32], f"inputs to image feature must be a float32 tensor, got {inputs.dtype}"
 
         inputs_encoded = self.encoder_obj(inputs)
 

@@ -53,12 +53,12 @@ class TVModelVariant:
 
 @DeveloperAPI
 def get_gray_default_image(num_channels: int, height: int, width: int) -> np.ndarray:
-    return np.full((num_channels, height, width), 128, dtype=np.uint8)
+    return np.full((num_channels, height, width), 128, dtype=np.float32)  # (C, H, W) array, every element 128
 
 
 @DeveloperAPI
 def get_average_image(image_lst: List[np.ndarray]) -> np.array:
-    return np.mean([x for x in image_lst if x is not None], axis=(0))
+    return np.mean([x for x in image_lst if x is not None], axis=(0), dtype=np.float32)  # None entries are skipped
 
 
 @DeveloperAPI

@@ -9,10 +9,12 @@
 import pytest
 from PIL import Image
 
+import ludwig
 from ludwig.api import LudwigModel
 from ludwig.constants import BATCH_SIZE, COLUMN, DECODER, NAME, PROC_COLUMN, TRAINER
 from ludwig.data.concatenate_datasets import concatenate_df
 from tests.integration_tests.utils import (
+    assert_preprocessed_dataset_shape_and_dtype_for_feature,
     audio_feature,
     binary_feature,
     category_feature,
@@ -238,6 +240,42 @@ def test_read_image_from_numpy_array(tmpdir, csv_filename):
     )
 
 
+def test_read_image_failure_default_image(monkeypatch, tmpdir, csv_filename):
+    """Tests that the default image substituted for unreadable images has the expected shape and float32 dtype."""
+
+    def mock_read_binary_files(self, column, map_fn, file_size):
+        """Stand-in for read_binary_files that maps every column entry to None, simulating a failed image read."""
+        return column.map(lambda x: None)
+
+    monkeypatch.setattr(ludwig.backend.base.LocalPreprocessingMixin, "read_binary_files", mock_read_binary_files)
+
+    image_feature_config = image_feature(os.path.join(tmpdir, "generated_output"))
+    input_features = [image_feature_config]
+    output_features = [category_feature(decoder={"vocab_size": 5}, reduce_input="sum")]
+
+    config = {
+        "input_features": input_features,
+        "output_features": output_features,
+        TRAINER: {"epochs": 2, BATCH_SIZE: 128},  # not exercised below: only preprocess() is called
+    }
+
+    data_csv = generate_data(
+        input_features, output_features, os.path.join(tmpdir, csv_filename), num_examples=NUM_EXAMPLES, nan_percent=0.2
+    )
+
+    model = LudwigModel(config)
+    preprocessed_dataset = model.preprocess(data_csv)
+    training_set_metadata = preprocessed_dataset.training_set_metadata
+
+    preprocessing = training_set_metadata[input_features[0][NAME]]["preprocessing"]
+    expected_shape = (preprocessing["num_channels"], preprocessing["height"], preprocessing["width"])
+    expected_dtype = np.float32
+
+    assert_preprocessed_dataset_shape_and_dtype_for_feature(
+        image_feature_config[NAME], preprocessed_dataset, model.config_obj, expected_dtype, expected_shape
+    )
+
+
 def test_number_feature_wrong_dtype(csv_filename, tmpdir):
     """Tests that a number feature with all string values is treated as having missing values by default."""
     data_csv_path = os.path.join(tmpdir, csv_filename)

@@ -24,7 +24,7 @@
 import traceback
 import uuid
 from distutils.util import strtobool
-from typing import Any, Dict, List, Optional, Set, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, TYPE_CHECKING, Union
 
 import cloudpickle
 import numpy as np
@@ -66,6 +66,10 @@
 from ludwig.utils import fs_utils
 from ludwig.utils.data_utils import read_csv, replace_file_extension, use_credentials
 
+if TYPE_CHECKING:
+    from ludwig.data.dataset.base import Dataset
+    from ludwig.schema.model_types.base import ModelConfig
+
 logger = logging.getLogger(__name__)
 
 # Used in sequence-related unit tests (encoders, features) as well as end-to-end integration tests.
@@ -961,6 +965,54 @@ def assert_all_required_metrics_exist(
             ), f"required metrics {required_metric_names} not in metrics {metric_names} for feature {feature_name}"
 
 
+def assert_preprocessed_dataset_shape_and_dtype_for_feature(
+    feature_name: str,
+    preprocessed_dataset: "Dataset",
+    config_obj: "ModelConfig",
+    expected_dtype: np.dtype,
+    expected_shape: Tuple,
+):
+    """Asserts that every preprocessed value of an input feature has the expected dtype and shape.
+
+    Args:
+        feature_name: the name of the input feature to check
+        preprocessed_dataset: the preprocessed dataset; its training, validation and test sets are all checked
+        config_obj: the model config object; only its `input_features` are consulted
+        expected_dtype: the dtype every value in the feature's proc column must have
+        expected_shape: the shape every value in the feature's proc column must have
+    Raises:
+        ValueError: if `feature_name` does not match exactly one input feature in `config_obj`.
+        AssertionError: if any value of the feature's proc column has a dtype other than `expected_dtype` or a
+            shape other than `expected_shape`.
+    """
+    if_configs = [if_config for if_config in config_obj.input_features if if_config.name == feature_name]
+    # fail fast if given `feature_name` is not found or is not unique
+    if len(if_configs) != 1:
+        raise ValueError(f"feature_name {feature_name} found {len(if_configs)} times in config_obj")
+    if_config = if_configs[0]
+
+    if_config_proc_column = if_config.proc_column
+    for result in [
+        preprocessed_dataset.training_set,
+        preprocessed_dataset.validation_set,
+        preprocessed_dataset.test_set,
+    ]:
+        result_df = result.to_df()  # assumes no split is None — TODO confirm against callers
+        result_df_proc_col = result_df[if_config_proc_column]
+
+        # Check that every value in the proc col has the expected dtype (an empty split passes vacuously)
+        result_df_proc_col_dtypes = set(result_df_proc_col.map(lambda x: x.dtype))
+        assert all(
+            [expected_dtype == dtype for dtype in result_df_proc_col_dtypes]
+        ), f"proc dtype should be {expected_dtype}, got the following set of values: {result_df_proc_col_dtypes}"
+
+        # Check that every value in the proc col has the expected shape (an empty split passes vacuously)
+        result_df_proc_col_shapes = set(result_df_proc_col.map(lambda x: x.shape))
+        assert all(
+            expected_shape == shape for shape in result_df_proc_col_shapes
+        ), f"proc shape should be {expected_shape}, got the following set of values: {result_df_proc_col_shapes}"
+
+
 @contextlib.contextmanager
 def remote_tmpdir(fs_protocol, bucket):
     if bucket is None: