cleanlab · 01PrathamS · Oct 13, 2023 · Oct 17, 2023 · Oct 25, 2023 · Oct 25, 2023
diff --git a/cleanlab/datalab/datalab.py b/cleanlab/datalab/datalab.py
@@ -28,19 +28,19 @@
 import pandas as pd
 
 import cleanlab
-from cleanlab.datalab.internal.adapter.imagelab import create_imagelab
+from cleanlab.datalab.internal.adapter.imagelab import (
+    create_imagelab,
+)
 from cleanlab.datalab.internal.data import Data
 from cleanlab.datalab.internal.display import _Displayer
 from cleanlab.datalab.internal.helper_factory import (
-    _DataIssuesBuilder,
+    data_issues_factory,
     issue_finder_factory,
     report_factory,
 )
-from cleanlab.datalab.internal.issue_manager_factory import (
-    list_default_issue_types as _list_default_issue_types,
-    list_possible_issue_types as _list_possible_issue_types,
-)
+from cleanlab.datalab.internal.issue_finder import IssueFinder
 from cleanlab.datalab.internal.serialize import _Serializer
+from cleanlab.datalab.internal.spurious_correlation import SpuriousCorrelations
 
 if TYPE_CHECKING:  # pragma: no cover
     import numpy.typing as npt
@@ -49,7 +49,6 @@
 
     DatasetLike = Union[Dataset, pd.DataFrame, Dict[str, Any], List[Dict[str, Any]], str]
 
-
 __all__ = ["Datalab"]
 
 
@@ -78,10 +77,6 @@ class Datalab:
             - path to a local file: Text (.txt), CSV (.csv), JSON (.json)
             - or a dataset identifier on the Hugging Face Hub
 
-    task : str
-        The type of machine learning task that the dataset is used for.
-        By default, this is set to "classification", but you can also set it to "regression" if you are working with a regression dataset.
-
     label_name : str, optional
         The name of the label column in the dataset.
 
@@ -106,30 +101,20 @@ class Datalab:
     def __init__(
         self,
         data: "DatasetLike",
-        task: str = "classification",
         label_name: Optional[str] = None,
         image_key: Optional[str] = None,
         verbosity: int = 1,
     ) -> None:
-        # Assume continuous values of labels for regression task
-        # Map labels to integers for classification task
-        map_labels_to_int = task == "classification"  # TODO: handle more generally
-
-        self._data = Data(data, label_name, map_to_int=map_labels_to_int)
+        self._data = Data(data, label_name)
         self.data = self._data._data
-        self.task = task
         self._labels = self._data.labels
         self._label_map = self._labels.label_map
         self.label_name = self._labels.label_name
         self._data_hash = self._data._data_hash
         self.cleanlab_version = cleanlab.version.__version__
         self.verbosity = verbosity
         self._imagelab = create_imagelab(dataset=self.data, image_key=image_key)
-
-        # Create the builder for DataIssues
-        builder = _DataIssuesBuilder(self._data)
-        builder.set_imagelab(self._imagelab).set_task(task)
-        self.data_issues = builder.build()
+        self.data_issues = data_issues_factory(self._imagelab)(self._data)
 
     # todo: check displayer methods
     def __repr__(self) -> str:
@@ -145,7 +130,7 @@ def labels(self) -> np.ndarray:
 
     @property
     def has_labels(self) -> bool:
-        """Whether the dataset has labels, and that they are in a [0, 1, ..., K-1] format."""
+        """Whether the dataset has labels."""
         return self._labels.is_available
 
     @property
@@ -306,21 +291,84 @@ def find_issues(
                 "No issue types were specified so no issues will be found in the dataset. Set `issue_types` as None to consider a default set of issues."
             )
             return None
-        issue_finder = issue_finder_factory(self._imagelab)(
-            datalab=self, task=self.task, verbosity=self.verbosity
-        )
+
+        issue_finder = issue_finder_factory(self._imagelab)(datalab=self, verbosity=self.verbosity)
         issue_finder.find_issues(
             pred_probs=pred_probs,
             features=features,
             knn_graph=knn_graph,
             issue_types=issue_types,
         )
-
         if self.verbosity:
             print(
                 f"\nAudit complete. {self.data_issues.issue_summary['num_issues'].sum()} issues found in the dataset."
             )
 
+    def _spurious_correlations(self, properties: Optional[List[str]] = None) -> pd.DataFrame:
+        """
+        Identify potential spurious correlations between image property scores and the labels.
+
+        Parameters
+        ----------
+        properties : Optional[List[str]]
+            A list of specific image properties (e.g. 'dark', 'grayscale') to be analyzed.
+            If None, every issue type present in both the Imagelab and the Datalab issue
+            summaries is considered.
+
+        Returns
+        -------
+        A DataFrame indicating correlations for each image property.
+
+        Raises
+        ------
+        NotImplementedError
+            If no Imagelab instance is attached to this Datalab.
+        ValueError
+            If no issues were found yet, or a property is not an issue type in the issue summary.
+
+        Note
+        ----
+        This method is a wrapper around the :py:meth:`SpuriousCorrelations.calculate_correlations <cleanlab.datalab.internal.spurious_correlation.SpuriousCorrelations.calculate_correlations>` method.
+
+        It is still a work in progress and may be subject to change in future versions.
+
+        See Also
+        --------
+        cleanlab.datalab.internal.spurious_correlation.SpuriousCorrelations
+        """
+        # TODO: Update this check when support for more properties is added.
+        if self._imagelab is None:
+            raise NotImplementedError("No ImageLab instance found. Please specify properties.")
+
+        # TODO: Update this check when support for more properties is added.
+        if self._imagelab.issue_summary.empty or self.issues.empty:
+            raise ValueError("No issues found in ImageLab. Please run find_issues() first.")
+
+        # Issue types known to Datalab; computed once and reused for filtering and validation.
+        valid_properties = self.issue_summary["issue_type"].values.tolist()
+
+        if properties is None:
+            # Default to the Imagelab issue types that Datalab also reports.
+            imagelab_types = self._imagelab.issue_summary["issue_type"].values.tolist()
+            properties = [p for p in imagelab_types if p in valid_properties]
+
+        # Validate the input properties.
+        for p in properties:
+            if p not in valid_properties:
+                raise ValueError(
+                    f"{p} is not a valid property. Available options: {valid_properties}"
+                )
+
+        # Strip only the trailing "_score" so names containing it elsewhere are not truncated.
+        suffix = "_score"
+        rename_map = {c: c[: -len(suffix)] for c in self.issues.columns if c.endswith(suffix)}
+
+        # Keep only the requested properties, in the column order of the issues dataframe.
+        df = self.issues[list(rename_map)].rename(columns=rename_map)
+        df = df[[c for c in rename_map.values() if c in properties]]
+
+        return SpuriousCorrelations(data=df, labels=self.labels).calculate_correlations()
+
     def report(
         self,
         *,
@@ -357,7 +405,6 @@ def report(
 
         reporter = report_factory(self._imagelab)(
             data_issues=self.data_issues,
-            task=self.task,
             verbosity=verbosity,
             include_description=include_description,
             show_summary_score=show_summary_score,
@@ -504,7 +551,8 @@ def get_info(self, issue_name: Optional[str] = None) -> Dict[str, Any]:
         """
         return self.data_issues.get_info(issue_name)
 
-    def list_possible_issue_types(self) -> List[str]:
+    @staticmethod
+    def list_possible_issue_types() -> List[str]:
         """Returns a list of all registered issue types.
 
         Any issue type that is not in this list cannot be used in the :py:meth:`find_issues` method.
@@ -517,9 +565,10 @@ def list_possible_issue_types(self) -> List[str]:
         --------
         :py:class:`REGISTRY <cleanlab.datalab.internal.issue_manager_factory.REGISTRY>` : All available issue types and their corresponding issue managers can be found here.
         """
-        return _list_possible_issue_types(task=self.task)
+        return IssueFinder.list_possible_issue_types()
 
-    def list_default_issue_types(self) -> List[str]:
+    @staticmethod
+    def list_default_issue_types() -> List[str]:
         """Returns a list of the issue types that are run by default
         when :py:meth:`find_issues` is called without specifying `issue_types`.
 
@@ -531,7 +580,7 @@ def list_default_issue_types(self) -> List[str]:
         --------
         :py:class:`REGISTRY <cleanlab.datalab.internal.issue_manager_factory.REGISTRY>` : All available issue types and their corresponding issue managers can be found here.
         """
-        return _list_default_issue_types(task=self.task)
+        return IssueFinder.list_default_issue_types()
 
     def save(self, path: str, force: bool = False) -> None:
         """Saves this Datalab object to file (all files are in folder at `path/`).

diff --git a/cleanlab/datalab/internal/adapter/imagelab.py b/cleanlab/datalab/internal/adapter/imagelab.py
@@ -16,7 +16,7 @@
     IMAGELAB_ISSUES_MAX_PREVALENCE,
 )
 from cleanlab.datalab.internal.data import Data
-from cleanlab.datalab.internal.data_issues import DataIssues, _InfoStrategy
+from cleanlab.datalab.internal.data_issues import DataIssues
 from cleanlab.datalab.internal.issue_finder import IssueFinder
 from cleanlab.datalab.internal.report import Reporter
 
@@ -70,8 +70,6 @@ class ImagelabDataIssuesAdapter(DataIssues):
     ----------
     data :
         The data object for which the issues are being collected.
-    strategy :
-        Strategy used for processing info dictionaries.
 
     Parameters
     ----------
@@ -84,8 +82,8 @@ class ImagelabDataIssuesAdapter(DataIssues):
         A dictionary that contains information and statistics about the data and each issue type.
     """
 
-    def __init__(self, data: Data, strategy: _InfoStrategy) -> None:
-        super().__init__(data, strategy)
+    def __init__(self, data: Data) -> None:
+        super().__init__(data)
 
     def _update_issues_imagelab(self, imagelab: "Imagelab", overlapping_issues: List[str]) -> None:
         overwrite_columns = [f"is_{issue_type}_issue" for issue_type in overlapping_issues]
@@ -145,14 +143,12 @@ def __init__(
         self,
         data_issues: "DataIssues",
         imagelab: "Imagelab",
-        task: str,
         verbosity: int = 1,
         include_description: bool = True,
         show_summary_score: bool = False,
     ):
         super().__init__(
             data_issues=data_issues,
-            task=task,
             verbosity=verbosity,
             include_description=include_description,
             show_summary_score=show_summary_score,
@@ -168,8 +164,8 @@ def report(self, num_examples: int) -> None:
 
 
 class ImagelabIssueFinderAdapter(IssueFinder):
-    def __init__(self, datalab, task, verbosity):
-        super().__init__(datalab, task, verbosity)
+    def __init__(self, datalab, verbosity):
+        super().__init__(datalab, verbosity)
         self.imagelab = self.datalab._imagelab
 
     def _get_imagelab_issue_types(self, issue_types, **kwargs):

diff --git a/cleanlab/datalab/internal/data.py b/cleanlab/datalab/internal/data.py
@@ -116,10 +116,6 @@ class Data:
     label_name : Union[str, List[str]]
         Name of the label column in the dataset.
 
-    map_to_int : bool
-        Whether to map the labels to integers, e.g. [0, 1, ..., K-1] where K is the number of classes.
-        If False, the labels are not mapped to integers, e.g. for regression tasks.
-
     Warnings
     --------
     Optional dependencies:
@@ -130,13 +126,11 @@ class Data:
         :py:class:`Datalab <cleanlab.datalab.datalab.Datalab>` to work.
     """
 
-    def __init__(
-        self, data: "DatasetLike", label_name: Optional[str] = None, map_to_int: bool = True
-    ) -> None:
+    def __init__(self, data: "DatasetLike", label_name: Optional[str] = None) -> None:
         self._validate_data(data)
         self._data = self._load_data(data)
         self._data_hash = hash(self._data)
-        self.labels = Label(data=self._data, label_name=label_name, map_to_int=map_to_int)
+        self.labels = Label(data=self._data, label_name=label_name)
 
     def _load_data(self, data: "DatasetLike") -> Dataset:
         """Checks the type of dataset and uses the correct loader method and
@@ -224,31 +218,17 @@ class Label:
     """
     Class to represent labels in a dataset.
 
-    It stores the labels as a numpy array and maps them to integers if necessary.
-    If a mapping is not necessary, e.g. for regression tasks, the mapping will be an empty dictionary.
-
     Parameters
     ----------
-    data :
-        A Hugging Face Dataset object.
-
-    label_name : str
-        Name of the label column in the dataset.
-
-    map_to_int : bool
-        Whether to map the labels to integers, e.g. [0, 1, ..., K-1] where K is the number of classes.
-        If False, the labels are not mapped to integers, e.g. for regression tasks.
     """
 
-    def __init__(
-        self, *, data: Dataset, label_name: Optional[str] = None, map_to_int: bool = True
-    ) -> None:
+    def __init__(self, *, data: Dataset, label_name: Optional[str] = None) -> None:
         self._data = data
         self.label_name = label_name
         self.labels = labels_to_array([])
         self.label_map: Mapping[str, Any] = {}
         if label_name is not None:
-            self.labels, self.label_map = _extract_labels(data, label_name, map_to_int)
+            self.labels, self.label_map = _extract_labels(data, label_name)
             self._validate_labels()
 
     def __len__(self) -> int:
@@ -293,7 +273,7 @@ def _validate_labels(self) -> None:
         assert len(labels) == len(self._data)
 
 
-def _extract_labels(data: Dataset, label_name: str, map_to_int: bool) -> Tuple[np.ndarray, Mapping]:
+def _extract_labels(data: Dataset, label_name: str) -> Tuple[np.ndarray, Mapping]:
     """
     Picks out labels from the dataset and formats them to be [0, 1, ..., K-1]
     where K is the number of classes. Also returns a mapping from the formatted
@@ -305,15 +285,9 @@ def _extract_labels(data: Dataset, label_name: str, map_to_int: bool) -> Tuple[n
 
     Parameters
     ----------
-    data : datasets.Dataset
-        A Hugging Face Dataset object.
-
     label_name : str
         Name of the column in the dataset that contains the labels.
 
-    map_to_int : bool
-        Whether to map the labels to integers, e.g. [0, 1, ..., K-1] where K is the number of classes.
-        If False, the labels are not mapped to integers, e.g. for regression tasks.
     Returns
     -------
     formatted_labels : np.ndarray
@@ -327,9 +301,6 @@ def _extract_labels(data: Dataset, label_name: str, map_to_int: bool) -> Tuple[n
     if labels.ndim != 1:
         raise ValueError("labels must be 1D numpy array.")
 
-    if not map_to_int:
-        # Don't map labels to integers, e.g. for regression tasks
-        return labels, {}
     label_name_feature = data.features[label_name]
     if isinstance(label_name_feature, ClassLabel):
         label_map = {label: label_name_feature.str2int(label) for label in label_name_feature.names}