Skip to content

Cyto utilities

Functions enabling smooth interaction with CellProfiler and DeepProfiler output formats.

A variety of utility functions for working with cytominer data.

pycytominer.cyto_utils.DeepProfiler_processing

Utility function to load and process the output files of a DeepProfiler run.

AggregateDeepProfiler

Class that holds all functions needed to aggregate the DeepProfiler (DP) run.

Attributes:

Name Type Description
deep_data DeepProfilerData

DeepProfilerData object to load data from DeepProfiler project

aggregated_profiles DataFrame

df to hold the metadata and profiles.

file_aggregate dict

dict that holds the file names and metadata. Is used to load in the npz files in the correct order and grouping.

output_file str

If provided, will write annotated profiles to folder. Defaults to None.

Methods:

Name Description
aggregate_deep

Given an initialized AggregateDeepProfiler() class, run this function to output level 3 profiles (aggregated profiles with annotated metadata).

Example

import pathlib from pycytominer.cyto_utils import DeepProfiler_processing

index_file = pathlib.Path("path/to/index.csv") profile_dir = pathlib.Path("path/to/features/")

deep_data = DeepProfiler_processing.DeepProfilerData(index_file, profile_dir, filename_delimiter="/", file_extension=".npz") deep_aggregate = DeepProfiler_processing.AggregateDeepProfiler(deep_data) deep_profiles = deep_aggregate.aggregate_deep()

Source code in pycytominer/cyto_utils/DeepProfiler_processing.py
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
class AggregateDeepProfiler:
    """Class that holds all functions needed to aggregate the DeepProfiler (DP) run.

    Attributes
    ----------
    deep_data : DeepProfilerData
        DeepProfilerData object to load data from DeepProfiler project
    aggregated_profiles : pandas.DataFrame
        df to hold the metadata and profiles.
    file_aggregate : dict
        dict that holds the file names and metadata.
        Is used to load in the npz files in the correct order and grouping.
    output_file : str
        If provided, will write annotated profiles to folder. Defaults to None.

    Methods
    -------
    aggregate_deep()
        Given an initialized AggregateDeepProfiler() class, run this function to output
        level 3 profiles (aggregated profiles with annotated metadata).

    Example
    -------
    import pathlib
    from pycytominer.cyto_utils import DeepProfiler_processing

    index_file = pathlib.Path("path/to/index.csv")
    profile_dir = pathlib.Path("path/to/features/")

    deep_data = DeepProfiler_processing.DeepProfilerData(index_file, profile_dir, filename_delimiter="/", file_extension=".npz")
    deep_aggregate = DeepProfiler_processing.AggregateDeepProfiler(deep_data)
    deep_profiles = deep_aggregate.aggregate_deep()
    """

    def __init__(
        self,
        deep_data: DeepProfilerData,
        aggregate_operation="median",
        aggregate_on="well",
        output_file=None,
    ):
        """
        __init__ function for this class.

        Arguments
        ---------
        deep_data : DeepProfilerData
            object that locates and loads the DeepProfiler output files
        aggregate_operation : str, default "median"
            statistic used to collapse the single-cell rows ("median" or "mean")
        aggregate_on : str, default "well"
            level at which profiles are grouped ("site", "well" or "plate")
        output_file : str, optional
            folder to write one csv per grouping; nothing is written when None
        """
        # NOTE(review): assert statements are stripped under `python -O`;
        # an explicit `raise ValueError` would survive optimized runs.
        assert aggregate_operation in [  # noqa: S101
            "median",
            "mean",
        ], "Input of aggregate_operation is incorrect, it must be either median or mean"
        assert aggregate_on in [  # noqa: S101
            "site",
            "well",
            "plate",
        ], "Input of aggregate_on is incorrect, it must be either site or well or plate"

        self.deep_data = deep_data
        self.aggregate_operation = aggregate_operation
        self.aggregate_on = aggregate_on
        self.output_file = output_file

    def setup_aggregate(self):
        """Set up the file_aggregate attribute.

        A helper function to `aggregate_deep` that populates the file_aggregate
        dictionary, which contains the file locations and metadata for each grouping.
        If for example we are grouping by well then the keys of self.file_aggregate would be:
        plate1/well1, plate1/well2, plate2/well1, etc.
        """
        # lazily build the npz file list on first use
        if not hasattr(self.deep_data, "filenames"):
            self.deep_data.build_filenames()

        self.file_aggregate = {}
        for filename in self.deep_data.filenames:
            file_info = self.deep_data.extract_filename_metadata(
                filename, self.deep_data.filename_delimiter
            )
            # default key (used when aggregating per plate)
            file_key = file_info[self.aggregate_on]

            if self.aggregate_on == "site":
                file_key = (
                    f"{file_info['plate']}/{file_info['well']}_{file_info['site']}"
                )

            if self.aggregate_on == "well":
                file_key = f"{file_info['plate']}/{file_info['well']}"

            if file_key in self.file_aggregate:
                self.file_aggregate[file_key]["files"].append(filename)
            else:
                self.file_aggregate[file_key] = {}
                self.file_aggregate[file_key]["files"] = [filename]

            # metadata of the last file seen wins for the grouping
            self.file_aggregate[file_key]["metadata"] = file_info

    def aggregate_deep(self):
        """
        Aggregate the DeepProfiler profiles into a pandas dataframe.

        For each key in file_aggregate, the profiles are loaded, concatenated and then aggregated.
        If files are missing, we throw a warning but continue the code.
        After aggregation, the metadata is concatenated back onto the dataframe.

        Returns
        -------
        df_out : pandas.dataframe
            dataframe with all metadata and the feature space.
            This is the input to any further pycytominer or pycytominer-eval processing
        """
        if not hasattr(self, "file_aggregate"):
            self.setup_aggregate()

        self.aggregated_profiles = []
        # e.g. "Metadata_Well_Position"
        self.aggregate_merge_col = f"Metadata_{self.aggregate_on.capitalize()}_Position"

        # Iterates over all sites, wells or plates
        for metadata_level in self.file_aggregate:
            # uses custom load function to create df with metadata and profiles
            arr = [
                load_npz_features(x)
                for x in self.file_aggregate[metadata_level]["files"]
            ]
            # empty dataframes from missing files are deleted
            arr = [x for x in arr if not x.empty]
            # if no files were found there is a miss-match between the index and the output files
            if not len(arr):
                warnings.warn(
                    f"No files for the key {metadata_level} could be found.\nThis program will continue, but be aware that this might induce errors!"
                )
                continue
            df = pd.concat(arr)

            # extract metadata prior to aggregation
            meta_df = pd.DataFrame()
            metadata_cols = infer_cp_features(df, metadata=True)
            profiles = [x for x in df.columns.tolist() if x not in metadata_cols]

            # If all rows have the same Metadata information, that value is valid for the aggregated df
            for col in metadata_cols:
                if len(df[col].unique()) == 1:
                    meta_df[col] = [df[col].unique()[0]]

            # perform the aggregation
            # the strata column is constant, so the whole grouping collapses to one row
            df = df.assign(Metadata_Aggregate_On=self.aggregate_on)
            df = aggregate.aggregate(
                population_df=df,
                strata="Metadata_Aggregate_On",
                features=profiles,
                operation=self.aggregate_operation,
            ).reset_index(drop=True)

            # add the aggregation level as a column
            df.loc[:, self.aggregate_merge_col] = metadata_level
            # concatenate the metadata back onto the aggregated profile
            df = pd.concat([df, meta_df], axis=1)

            # save metalevel file
            if self.output_file is not None:
                if not os.path.exists(self.output_file):
                    # NOTE(review): os.mkdir creates one level only; os.makedirs
                    # would be needed for a nested output path — confirm
                    os.mkdir(self.output_file)
                file_path = os.path.join(
                    self.output_file, metadata_level.replace("/", "_")
                )
                df.to_csv(f"{file_path}.csv", index=False)
            self.aggregated_profiles.append(df)

        # Concatenate all of the above created profiles
        # NOTE(review): raises ValueError if every grouping was skipped above — confirm intended
        self.aggregated_profiles = pd.concat(
            list(self.aggregated_profiles)
        ).reset_index(drop=True)

        # clean and reindex columns
        self.aggregated_profiles.columns = [
            str(x) for x in self.aggregated_profiles.columns
        ]
        meta_features = infer_cp_features(self.aggregated_profiles, metadata=True)
        # NOTE(review): `profiles` carries over from the *last* loop iteration;
        # assumes every grouping shares the same feature columns — confirm
        reindex_profiles = [str(x) for x in profiles]
        self.aggregated_profiles = self.aggregated_profiles.reindex(
            meta_features + reindex_profiles, axis="columns"
        )

        # If Columns have NaN values from concatenation, drop these
        # (any column containing at least one NaN is removed entirely)
        self.aggregated_profiles.dropna(axis="columns", inplace=True)

        df_out = self.aggregated_profiles
        return df_out

__init__(deep_data, aggregate_operation='median', aggregate_on='well', output_file=None)

init function for this class.

Arguments

See above for all parameters.

Source code in pycytominer/cyto_utils/DeepProfiler_processing.py
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
def __init__(
    self,
    deep_data: DeepProfilerData,
    aggregate_operation="median",
    aggregate_on="well",
    output_file=None,
):
    """
    __init__ function for this class.

    Validates the requested aggregation settings and stores all
    configuration on the instance.

    Arguments
    ---------
    See above for all parameters.
    """
    valid_operations = ("median", "mean")
    assert aggregate_operation in valid_operations, (  # noqa: S101
        "Input of aggregate_operation is incorrect, it must be either median or mean"
    )
    valid_levels = ("site", "well", "plate")
    assert aggregate_on in valid_levels, (  # noqa: S101
        "Input of aggregate_on is incorrect, it must be either site or well or plate"
    )

    self.deep_data = deep_data
    self.aggregate_operation = aggregate_operation
    self.aggregate_on = aggregate_on
    self.output_file = output_file

aggregate_deep()

Aggregate the DeepProfiler profiles into a pandas dataframe.

For each key in file_aggregate, the profiles are loaded, concatenated and then aggregated. If files are missing, we throw a warning but continue the code. After aggregation, the metadata is concatenated back onto the dataframe.

Returns:

Name Type Description
df_out dataframe

dataframe with all metadata and the feature space. This is the input to any further pycytominer or pycytominer-eval processing

Source code in pycytominer/cyto_utils/DeepProfiler_processing.py
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
def aggregate_deep(self):
    """
    Aggregate the DeepProfiler profiles into a pandas dataframe.

    For each key in file_aggregate, the profiles are loaded, concatenated and then aggregated.
    If files are missing, we throw a warning but continue the code.
    After aggregation, the metadata is concatenated back onto the dataframe.

    Returns
    -------
    df_out : pandas.dataframe
        dataframe with all metadata and the feature space.
        This is the input to any further pycytominer or pycytominer-eval processing
    """
    if not hasattr(self, "file_aggregate"):
        self.setup_aggregate()

    self.aggregated_profiles = []
    # e.g. "Metadata_Well_Position"
    self.aggregate_merge_col = f"Metadata_{self.aggregate_on.capitalize()}_Position"

    # Iterates over all sites, wells or plates
    for metadata_level in self.file_aggregate:
        # uses custom load function to create df with metadata and profiles
        arr = [
            load_npz_features(x)
            for x in self.file_aggregate[metadata_level]["files"]
        ]
        # empty dataframes from missing files are deleted
        arr = [x for x in arr if not x.empty]
        # if no files were found there is a miss-match between the index and the output files
        if not len(arr):
            warnings.warn(
                f"No files for the key {metadata_level} could be found.\nThis program will continue, but be aware that this might induce errors!"
            )
            continue
        df = pd.concat(arr)

        # extract metadata prior to aggregation
        meta_df = pd.DataFrame()
        metadata_cols = infer_cp_features(df, metadata=True)
        profiles = [x for x in df.columns.tolist() if x not in metadata_cols]

        # If all rows have the same Metadata information, that value is valid for the aggregated df
        for col in metadata_cols:
            if len(df[col].unique()) == 1:
                meta_df[col] = [df[col].unique()[0]]

        # perform the aggregation
        # the strata column is constant, so the whole grouping collapses to one row
        df = df.assign(Metadata_Aggregate_On=self.aggregate_on)
        df = aggregate.aggregate(
            population_df=df,
            strata="Metadata_Aggregate_On",
            features=profiles,
            operation=self.aggregate_operation,
        ).reset_index(drop=True)

        # add the aggregation level as a column
        df.loc[:, self.aggregate_merge_col] = metadata_level
        # concatenate the metadata back onto the aggregated profile
        df = pd.concat([df, meta_df], axis=1)

        # save metalevel file
        if self.output_file is not None:
            if not os.path.exists(self.output_file):
                # NOTE(review): os.mkdir creates one level only; os.makedirs
                # would be needed for a nested output path — confirm
                os.mkdir(self.output_file)
            file_path = os.path.join(
                self.output_file, metadata_level.replace("/", "_")
            )
            df.to_csv(f"{file_path}.csv", index=False)
        self.aggregated_profiles.append(df)

    # Concatenate all of the above created profiles
    # NOTE(review): raises ValueError if every grouping was skipped above — confirm intended
    self.aggregated_profiles = pd.concat(
        list(self.aggregated_profiles)
    ).reset_index(drop=True)

    # clean and reindex columns
    self.aggregated_profiles.columns = [
        str(x) for x in self.aggregated_profiles.columns
    ]
    meta_features = infer_cp_features(self.aggregated_profiles, metadata=True)
    # NOTE(review): `profiles` carries over from the *last* loop iteration;
    # assumes every grouping shares the same feature columns — confirm
    reindex_profiles = [str(x) for x in profiles]
    self.aggregated_profiles = self.aggregated_profiles.reindex(
        meta_features + reindex_profiles, axis="columns"
    )

    # If Columns have NaN values from concatenation, drop these
    # (any column containing at least one NaN is removed entirely)
    self.aggregated_profiles.dropna(axis="columns", inplace=True)

    df_out = self.aggregated_profiles
    return df_out

setup_aggregate()

Set up the file_aggregate attribute.

A helper function to aggregate_deep that populates the file_aggregate dictionary, which contains the file locations and metadata for each grouping. If for example we are grouping by well then the keys of self.file_aggregate would be: plate1/well1, plate1/well2, plate2/well1, etc.

Source code in pycytominer/cyto_utils/DeepProfiler_processing.py
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
def setup_aggregate(self):
    """Set up the file_aggregate attribute.

    A helper function to `aggregate_deep` that populates the file_aggregate
    dictionary, mapping one key per grouping to the npz file locations and
    the filename metadata of that grouping. When grouping by well, the keys
    of self.file_aggregate look like: plate1/well1, plate1/well2, plate2/well1, etc.
    """
    # lazily build the npz file list on first use
    if not hasattr(self.deep_data, "filenames"):
        self.deep_data.build_filenames()

    self.file_aggregate = {}
    for filename in self.deep_data.filenames:
        file_info = self.deep_data.extract_filename_metadata(
            filename, self.deep_data.filename_delimiter
        )

        # derive the grouping key for this file
        if self.aggregate_on == "site":
            file_key = (
                f"{file_info['plate']}/{file_info['well']}_{file_info['site']}"
            )
        elif self.aggregate_on == "well":
            file_key = f"{file_info['plate']}/{file_info['well']}"
        else:
            # per-plate aggregation uses the plate name directly
            file_key = file_info[self.aggregate_on]

        # collect files under their grouping key; the metadata of the
        # most recent file wins (files in a group share plate/well anyway)
        entry = self.file_aggregate.setdefault(file_key, {"files": []})
        entry["files"].append(filename)
        entry["metadata"] = file_info

DeepProfilerData

Class that holds all functions needed to load and annotate the DeepProfiler (DP) run.

Attributes:

Name Type Description
profile_dir str

file location of the output profiles from DeepProfiler (e.g. /project1/outputs/results/features/)

filename_delimiter default = '_'

delimiter for the filenames of the profiles (e.g. B02_4.npz).

file_extension default = '.npz'

extension of the profile file.

index_df DataFrame

load in the index.csv file from DeepProfiler, provided by an input index file.

filenames list of paths

list of Purepaths that point to the npz files.

Methods:

Name Description
build_filenames

build filenames from index_df

extract_filename_metadata

get site, well, plate info for npz file

Source code in pycytominer/cyto_utils/DeepProfiler_processing.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
class DeepProfilerData:
    """Class that holds all functions needed to load and annotate the DeepProfiler (DP) run.

    Attributes
    ----------
    profile_dir : str
        file location of the output profiles from DeepProfiler
        (e.g. `/project1/outputs/results/features/`)
    filename_delimiter : default = '_'
        delimiter for the filenames of the profiles (e.g. B02_4.npz).
    file_extension : default = '.npz'
        extension of the profile file.
    index_df : pandas.DataFrame
        load in the index.csv file from DeepProfiler, provided by an input index file.
    filenames : list of paths
        list of Purepaths that point to the npz files.

    Methods
    -------
    build_filenames()
        build filenames from index_df
    extract_filename_metadata(npz_file, delimiter="_")
        get site, well, plate info for npz file
    """

    def __init__(
        self,
        index_file,
        profile_dir,
        filename_delimiter="_",
        file_extension=".npz",
    ):
        """
        __init__ function for this class.

        Arguments
        ---------
        index_file : str
            file location of the index.csv from DP

        See above for all other parameters.
        """
        # read identifiers as strings so numeric-looking wells/plates
        # (e.g. "0012") keep their leading zeros
        self.index_df = pd.read_csv(index_file, dtype=str)
        self.profile_dir = profile_dir
        self.filename_delimiter = filename_delimiter
        # normalize so both "npz" and ".npz" are accepted
        self.file_extension = file_extension
        if not self.file_extension.startswith("."):
            self.file_extension = f".{self.file_extension}"

    def build_filenames(self):
        """Create file names indicated by plate, well, and site information."""
        self.filenames = self.index_df.apply(
            self.build_filename_from_index, axis="columns"
        )
        self.filenames = [
            pathlib.PurePath(f"{self.profile_dir}/{x}") for x in self.filenames
        ]

    def build_filename_from_index(self, row):
        """Build the name of the profile files."""
        plate = row["Metadata_Plate"]
        well = row["Metadata_Well"]
        site = row["Metadata_Site"]

        filename = f"{plate}/{well}{self.filename_delimiter}{site}{self.file_extension}"
        return filename

    def extract_filename_metadata(self, npz_file, delimiter="_"):
        """Extract metadata (site, well and plate) from the filename.

        This function is used to extract the metadata from the filename of the npz files.
        It expects a naming convention of path/plate/well{delimiter}site.npz
        (i.e. path/plate/well/site.npz when the delimiter is "/").

        Arguments
        ---------
        npz_file : str
            file path (assumed to use "/" as the path separator)

        delimiter : str
            the delimiter used in the naming convention of the files. default = '_'

        Returns
        -------
        loc : dict
            dict with metadata
        """
        path_parts = str(npz_file).split("/")
        # Remove only a trailing ".npz". The previous `str.strip(".npz")`
        # removed any of the characters '.', 'n', 'p', 'z' from BOTH ends,
        # corrupting names such as "zB02" or sites ending in those letters.
        stem = path_parts[-1]
        if stem.endswith(".npz"):
            stem = stem[: -len(".npz")]

        if delimiter == "/":
            # layout: .../plate/well/site.npz
            site = stem
            well = path_parts[-2]
            # plate is one level above the well directory; the previous
            # code took [-2], which duplicated the well name
            plate = path_parts[-3]
        else:
            # layout: .../plate/well{delimiter}site.npz
            base_file = stem.split(delimiter)
            site = base_file[-1]
            well = base_file[-2]
            plate = path_parts[-2]

        loc = {"site": site, "well": well, "plate": plate}
        return loc

__init__(index_file, profile_dir, filename_delimiter='_', file_extension='.npz')

init function for this class.

Arguments

index_file : str file location of the index.csv from DP

See above for all other parameters.

Source code in pycytominer/cyto_utils/DeepProfiler_processing.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def __init__(
    self,
    index_file,
    profile_dir,
    filename_delimiter="_",
    file_extension=".npz",
):
    """
    __init__ function for this class.

    Loads the DeepProfiler index file and records the profile-file naming
    configuration.

    Arguments
    ---------
    index_file : str
        file location of the index.csv from DP

    See above for all other parameters.
    """
    # read identifiers as strings so numeric-looking wells/plates keep leading zeros
    self.index_df = pd.read_csv(index_file, dtype=str)
    self.profile_dir = profile_dir
    self.filename_delimiter = filename_delimiter
    # accept the extension with or without a leading dot
    self.file_extension = (
        file_extension if file_extension.startswith(".") else f".{file_extension}"
    )

build_filename_from_index(row)

Build the name of the profile files.

Source code in pycytominer/cyto_utils/DeepProfiler_processing.py
75
76
77
78
79
80
81
82
def build_filename_from_index(self, row):
    """Build the name of the profile files.

    Combines the row's plate, well, and site identifiers with the configured
    delimiter and file extension into a relative path: plate/well{delim}site{ext}.
    """
    return (
        f"{row['Metadata_Plate']}/"
        f"{row['Metadata_Well']}{self.filename_delimiter}"
        f"{row['Metadata_Site']}{self.file_extension}"
    )

build_filenames()

Create file names indicated by plate, well, and site information.

Source code in pycytominer/cyto_utils/DeepProfiler_processing.py
66
67
68
69
70
71
72
73
def build_filenames(self):
    """Create file names indicated by plate, well, and site information."""
    # one relative profile path per row of the index file
    relative_paths = self.index_df.apply(
        self.build_filename_from_index, axis="columns"
    )
    # anchor each relative path under the profile directory
    self.filenames = [
        pathlib.PurePath(f"{self.profile_dir}/{relative}")
        for relative in relative_paths
    ]

extract_filename_metadata(npz_file, delimiter='_')

Extract metadata (site, well and plate) from the filename.

This function is used to extract the metadata from the filename of the npz files. It expects a naming convention of path/plate/well{delimiter}site.npz.

Arguments

npz_file : str file path

delimiter : str the delimiter used in the naming convention of the files. default = '_'

Returns:

Name Type Description
loc dict

dict with metadata

Source code in pycytominer/cyto_utils/DeepProfiler_processing.py
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
def extract_filename_metadata(self, npz_file, delimiter="_"):
    """Extract metadata (site, well and plate) from the filename.

    This function is used to extract the metadata from the filename of the npz files.
    It expects a naming convention of path/plate/well{delimiter}site.npz
    (i.e. path/plate/well/site.npz when the delimiter is "/").

    Arguments
    ---------
    npz_file : str
        file path (assumed to use "/" as the path separator)

    delimiter : str
        the delimiter used in the naming convention of the files. default = '_'

    Returns
    -------
    loc : dict
        dict with metadata
    """
    path_parts = str(npz_file).split("/")
    # Remove only a trailing ".npz". The previous `str.strip(".npz")`
    # removed any of the characters '.', 'n', 'p', 'z' from BOTH ends,
    # corrupting names such as "zB02" or sites ending in those letters.
    stem = path_parts[-1]
    if stem.endswith(".npz"):
        stem = stem[: -len(".npz")]

    if delimiter == "/":
        # layout: .../plate/well/site.npz
        site = stem
        well = path_parts[-2]
        # plate is one level above the well directory; the previous
        # code took [-2], which duplicated the well name
        plate = path_parts[-3]
    else:
        # layout: .../plate/well{delimiter}site.npz
        base_file = stem.split(delimiter)
        site = base_file[-1]
        well = base_file[-2]
        plate = path_parts[-2]

    loc = {"site": site, "well": well, "plate": plate}
    return loc

SingleCellDeepProfiler

Class that holds functions needed to analyze single cells from the DeepProfiler (DP) run.

Only pycytominer.normalization() is implemented.

Attributes:

Name Type Description
deep_data DeepProfilerData

DeepProfilerData object to load data from DeepProfiler project

aggregated_profiles DataFrame

df to hold the metadata and profiles.

file_aggregate dict

dict that holds the file names and metadata. Is used to load in the npz files in the correct order and grouping.

output_file str

If provided, will write annotated profiles to folder. Defaults to None.

Methods:

Name Description
normalize
float_format, mad_robustize_epsilon, spherize_center, spherize_method, spherize_epsilon)

normalize profiling features from DeepProfiler run with pycytominer.normalize()

Example

import pathlib from pycytominer.cyto_utils import DeepProfiler_processing

index_file = pathlib.Path("path/to/index.csv") profile_dir = pathlib.Path("path/to/features/")

deep_data = DeepProfiler_processing.DeepProfilerData(index_file, profile_dir, filename_delimiter="/", file_extension=".npz") deep_single_cell = DeepProfiler_processing.SingleCellDeepProfiler(deep_data) normalized = deep_single_cell.normalize_deep_single_cells()

Source code in pycytominer/cyto_utils/DeepProfiler_processing.py
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
class SingleCellDeepProfiler:
    """Class that holds functions needed to analyze single cells from the DeepProfiler (DP) run.

    Only pycytominer.normalization() is implemented.

    Attributes
    ----------
    deep_data : DeepProfilerData
        DeepProfilerData object to load data from DeepProfiler project
    aggregated_profiles : pandas.DataFrame
        df to hold the metadata and profiles.
    file_aggregate : dict
        dict that holds the file names and metadata.
        Is used to load in the npz files in the correct order and grouping.
    output_file : str
        If provided, will write annotated profiles to folder. Defaults to None.

    Methods
    -------
    normalize(profiles, features, image_features, meta_features, samples, method, output_file, compression_options,
    float_format, mad_robustize_epsilon, spherize_center, spherize_method, spherize_epsilon)
        normalize profiling features from DeepProfiler run with pycytominer.normalize()

    Example
    -------
    import pathlib
    from pycytominer.cyto_utils import DeepProfiler_processing

    index_file = pathlib.Path("path/to/index.csv")
    profile_dir = pathlib.Path("path/to/features/")

    deep_data = DeepProfiler_processing.DeepProfilerData(index_file, profile_dir, filename_delimiter="/", file_extension=".npz")
    deep_single_cell = DeepProfiler_processing.SingleCellDeepProfiler(deep_data)
    normalized = deep_single_cell.normalize_deep_single_cells()
    """

    def __init__(
        self,
        deep_data: DeepProfilerData,
    ):
        """
        __init__ function for this class.

        Arguments
        ---------
        deep_data : DeepProfilerData
            object that locates and loads the DeepProfiler output files
        """
        self.deep_data = deep_data

    def get_single_cells(
        self, output=False, location_x_col_index=0, location_y_col_index=1
    ):
        """Set up a single_cells dataframe in the format expected by pycytominer.normalize().

        Helper function to normalize_deep_single_cells() that sets up the single_cells attribute or outputs it as a dataframe.

        Arguments
        -----------
        output : bool
            If true, will output the single cell dataframe instead of setting to self attribute
        location_x_col_index: int
            index of the x location column (which column in DP output has X coords)
        location_y_col_index: int
            index of the y location column (which column in DP output has Y coords)
        """
        # build filenames if they do not already exist
        if not hasattr(self.deep_data, "filenames"):
            self.deep_data.build_filenames()

        # compile features dataframe with single cell locations
        total_df = []
        for features_path in self.deep_data.filenames:
            features = load_npz_features(features_path)
            # skip a file if there are no features
            if len(features.index) == 0:
                warnings.warn(
                    f"No features could be found at {features_path}.\nThis program will continue, but be aware that this might induce errors!"
                )
                continue
            locations = load_npz_locations(
                features_path, location_x_col_index, location_y_col_index
            )
            # column-wise concat; assumes locations and features share the same
            # per-cell row order within the npz file — TODO confirm
            detailed_df = pd.concat([locations, features], axis=1)

            total_df.append(detailed_df)

        # NOTE(review): raises ValueError if every file was skipped above — confirm intended
        sc_df = pd.concat(total_df).reset_index(drop=True)
        if output:
            return sc_df
        else:
            # implicitly returns None; result is stored on self.single_cells
            self.single_cells = sc_df

    def normalize_deep_single_cells(
        self,
        location_x_col_index=0,
        location_y_col_index=1,
        image_features=False,  # not implemented with DeepProfiler
        meta_features="infer",
        samples="all",
        method="standardize",
        output_file=None,
        compression_options=None,
        float_format=None,
        mad_robustize_epsilon=1e-18,
        spherize_center=True,
        spherize_method="ZCA-cor",
        spherize_epsilon=1e-6,
    ):
        """
        Normalize all cells into a pandas dataframe.

        For each file in the DP project features folder, the features from each cell are loaded.
        These features are put into a profiles dataframe for use in pycytominer.normalize.
        A features list is also compiled for use in pycytominer.normalize.

        Arguments
        ---------
        location_x_col_index : int
            index of the column in the DP output that holds the X coordinates
        location_y_col_index : int
            index of the column in the DP output that holds the Y coordinates
        image_features : bool
            not implemented with DeepProfiler; forwarded to pycytominer.normalize()

        All remaining parameters are forwarded to pycytominer.normalize();
        output_file is handled in this method instead (see below).

        Returns
        -------
        df_out : pandas.dataframe
            dataframe with all metadata and the feature space.
            This is the input to any further pycytominer or pycytominer-eval processing
        """
        # NOTE(review): consider the logging module instead of print for library code
        print("getting single cells")
        # setup single_cells attribute
        if not hasattr(self, "single_cells"):
            self.get_single_cells(
                output=False,
                location_x_col_index=location_x_col_index,
                location_y_col_index=location_y_col_index,
            )

        # extract metadata prior to normalization
        metadata_cols = infer_cp_features(self.single_cells, metadata=True)
        # locations are not automatically inferred with cp features
        metadata_cols.append("Location_Center_X")
        metadata_cols.append("Location_Center_Y")
        derived_features = [
            x for x in self.single_cells.columns.tolist() if x not in metadata_cols
        ]

        # wrapper for pycytominer.normalize() function
        # output_file=None on purpose: writing happens below, after the
        # location columns are re-inserted
        normalized = normalize.normalize(
            profiles=self.single_cells,
            features=derived_features,
            image_features=image_features,
            meta_features=meta_features,
            samples=samples,
            method=method,
            output_file=None,
            compression_options=compression_options,
            float_format=float_format,
            mad_robustize_epsilon=mad_robustize_epsilon,
            spherize_center=spherize_center,
            spherize_method=spherize_method,
            spherize_epsilon=spherize_epsilon,
        )

        # move x locations and y locations to metadata columns of normalized df
        # assumes normalize() preserved the row order of self.single_cells — TODO confirm
        x_locations = self.single_cells["Location_Center_X"]
        normalized.insert(0, "Location_Center_X", x_locations)
        y_locations = self.single_cells["Location_Center_Y"]
        normalized.insert(1, "Location_Center_Y", y_locations)

        # separate code because normalize() will not return if it has an output file specified
        if output_file is not None:
            # presumably the pycytominer.cyto_utils.output helper — verify import
            output(
                df=normalized,
                output_filename=output_file,
                compression_options=compression_options,
                float_format=float_format,
            )

        return normalized

__init__(deep_data)

init function for this class.

Arguments

See above for all parameters.

Source code in pycytominer/cyto_utils/DeepProfiler_processing.py
341
342
343
344
345
346
347
348
349
350
351
352
def __init__(
    self,
    deep_data: DeepProfilerData,
):
    """
    __init__ function for this class.

    Arguments
    ---------
    deep_data : DeepProfilerData
        DeepProfilerData object pointing at the index file and feature folder
        of a DeepProfiler run; all processing methods read their inputs from it.
    """
    self.deep_data = deep_data

get_single_cells(output=False, location_x_col_index=0, location_y_col_index=1)

Set up a single_cells dataframe in the format expected by pycytominer.normalize().

Helper function to normalize_deep_single_cells() that sets up the single_cells attribute or outputs it as a dataframe.

Arguments

output : bool If true, will output the single cell dataframe instead of setting to self attribute location_x_col_index: int index of the x location column (which column in DP output has X coords) location_y_col_index: int index of the y location column (which column in DP output has Y coords)

Source code in pycytominer/cyto_utils/DeepProfiler_processing.py
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
def get_single_cells(
    self, output=False, location_x_col_index=0, location_y_col_index=1
):
    """Set up a single_cells dataframe in the format expected by pycytominer.normalize().

    Helper function to normalize_deep_single_cells() that either sets the
    single_cells attribute on self or returns the assembled dataframe.

    Arguments
    -----------
    output : bool
        If true, return the single cell dataframe instead of setting a self attribute
    location_x_col_index: int
        index of the x location column (which column in DP output has X coords)
    location_y_col_index: int
        index of the y location column (which column in DP output has Y coords)
    """
    # make sure the list of npz files has been compiled
    if not hasattr(self.deep_data, "filenames"):
        self.deep_data.build_filenames()

    # collect one dataframe (locations + features) per npz file
    per_file_frames = []
    for npz_path in self.deep_data.filenames:
        feature_df = load_npz_features(npz_path)
        # a file without features is skipped, with a warning
        if feature_df.empty:
            warnings.warn(
                f"No features could be found at {npz_path}.\nThis program will continue, but be aware that this might induce errors!"
            )
            continue
        location_df = load_npz_locations(
            npz_path, location_x_col_index, location_y_col_index
        )
        per_file_frames.append(pd.concat([location_df, feature_df], axis=1))

    sc_df = pd.concat(per_file_frames).reset_index(drop=True)
    if not output:
        self.single_cells = sc_df
        return None
    return sc_df

normalize_deep_single_cells(location_x_col_index=0, location_y_col_index=1, image_features=False, meta_features='infer', samples='all', method='standardize', output_file=None, compression_options=None, float_format=None, mad_robustize_epsilon=1e-18, spherize_center=True, spherize_method='ZCA-cor', spherize_epsilon=1e-06)

Normalize all cells into a pandas dataframe.

For each file in the DP project features folder, the features from each cell are loaded. These features are put into a profiles dataframe for use in pycytominer.normalize. A features list is also compiled for use in pycytominer.normalize.

Returns:

Name Type Description
df_out dataframe

dataframe with all metadata and the feature space. This is the input to any further pycytominer or pycytominer-eval processing

Source code in pycytominer/cyto_utils/DeepProfiler_processing.py
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
def normalize_deep_single_cells(
    self,
    location_x_col_index=0,
    location_y_col_index=1,
    image_features=False,  # not implemented with DeepProfiler
    meta_features="infer",
    samples="all",
    method="standardize",
    output_file=None,
    compression_options=None,
    float_format=None,
    mad_robustize_epsilon=1e-18,
    spherize_center=True,
    spherize_method="ZCA-cor",
    spherize_epsilon=1e-6,
):
    """
    Normalize all cells into a pandas dataframe.

    For each file in the DP project features folder, the features from each cell are loaded.
    These features are put into a profiles dataframe for use in pycytominer.normalize.
    A features list is also compiled for use in pycytominer.normalize.

    Returns
    -------
    df_out : pandas.dataframe
        dataframe with all metadata and the feature space.
        This is the input to any further pycytominer or pycytominer-eval processing
    """
    print("getting single cells")
    # lazily build the single_cells attribute on first use
    if not hasattr(self, "single_cells"):
        self.get_single_cells(
            output=False,
            location_x_col_index=location_x_col_index,
            location_y_col_index=location_y_col_index,
        )

    # split columns into metadata vs. derived features; the location columns
    # count as metadata even though infer_cp_features does not flag them
    meta_cols = infer_cp_features(self.single_cells, metadata=True)
    meta_cols += ["Location_Center_X", "Location_Center_Y"]
    feature_cols = [
        col for col in self.single_cells.columns.tolist() if col not in meta_cols
    ]

    # delegate the actual work to pycytominer.normalize()
    normalized_df = normalize.normalize(
        profiles=self.single_cells,
        features=feature_cols,
        image_features=image_features,
        meta_features=meta_features,
        samples=samples,
        method=method,
        output_file=None,
        compression_options=compression_options,
        float_format=float_format,
        mad_robustize_epsilon=mad_robustize_epsilon,
        spherize_center=spherize_center,
        spherize_method=spherize_method,
        spherize_epsilon=spherize_epsilon,
    )

    # re-attach the cell locations as the first two columns of the result
    normalized_df.insert(
        0, "Location_Center_X", self.single_cells["Location_Center_X"]
    )
    normalized_df.insert(
        1, "Location_Center_Y", self.single_cells["Location_Center_Y"]
    )

    # write to disk here: normalize() was deliberately called with
    # output_file=None so that it always returns the dataframe
    if output_file is not None:
        output(
            df=normalized_df,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )

    return normalized_df

pycytominer.cyto_utils.annotate_custom

Functions to annotate data frames with custom options according to CMAP specifications.

annotate_cmap(annotated, annotate_join_on, cell_id='unknown', perturbation_mode='none')

Annotates data frame with custom options according to CMAP specifications.

Parameters:

Name Type Description Default
annotated DataFrame

DataFrame of profiles.

required
annotate_join_on str

Typically the well metadata, but how to join external data

required
cell_id str

provide a string to annotate cell id column

"unknown"
perturbation_mode str

How to annotate CMAP specific data (options = ["chemical" , "genetic"])

"none"

Returns:

Type Description
annotated

CMAP annotated data

Source code in pycytominer/cyto_utils/annotate_custom.py
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
def annotate_cmap(
    annotated, annotate_join_on, cell_id="unknown", perturbation_mode="none"
):
    """Annotates data frame with custom options according to CMAP specifications.

    Parameters
    ----------
    annotated : pandas.core.frame.DataFrame
        DataFrame of profiles.
    annotate_join_on : str
        Typically the well metadata, but how to join external data
    cell_id : str, default "unknown"
        provide a string to annotate cell id column
    perturbation_mode : str, default "none"
        How to annotate CMAP specific data (options = ["chemical" , "genetic"])

    Returns
    -------
    annotated
        CMAP annotated data
    """
    pert_opts = ["none", "chemical", "genetic"]
    assert (  # noqa: S101
        perturbation_mode in pert_opts
    ), f"perturbation mode must be one of {pert_opts}"

    assert (  # noqa: S101
        "Metadata_broad_sample" in annotated.columns
    ), "Are you sure this is a CMAP file? 'Metadata_broad_sample column not found.'"

    # core CMAP columns derived from the broad sample identifier
    broad_sample = annotated.Metadata_broad_sample
    annotated = annotated.assign(
        Metadata_pert_id=broad_sample.str.extract(r"(BRD[-N][A-Z0-9]+)"),
        Metadata_pert_mfc_id=broad_sample,
        Metadata_pert_well=annotated.loc[:, annotate_join_on],
        Metadata_pert_id_vendor="",
    )

    if "Metadata_pert_iname" in annotated.columns:
        pert_iname = annotated.Metadata_pert_iname
        annotated = annotated.assign(
            Metadata_pert_mfc_desc=pert_iname,
            Metadata_pert_name=pert_iname,
        )

    if "Metadata_cell_id" not in annotated.columns:
        annotated = annotated.assign(Metadata_cell_id=cell_id)

    if perturbation_mode == "chemical":
        annotated = annotated.assign(
            Metadata_broad_sample_type=[
                "control" if sample in ["DMSO", np.nan] else "trt"
                for sample in annotated.Metadata_broad_sample
            ]
        )

        # Generate Metadata_broad_sample column
        # controls never have broad_sample "empty" afterwards, so this mask
        # stays valid for the dose columns below
        is_control = annotated.Metadata_broad_sample_type == "control"
        annotated.loc[is_control, "Metadata_broad_sample"] = "DMSO"
        annotated.loc[
            annotated.Metadata_broad_sample == "empty", "Metadata_broad_sample_type"
        ] = "empty"

        if "Metadata_mmoles_per_liter" in annotated.columns:
            annotated.loc[is_control, "Metadata_mmoles_per_liter"] = 0

        if "Metadata_solvent" in annotated.columns:
            annotated = annotated.assign(
                Metadata_pert_vehicle=annotated.Metadata_solvent
            )
        if "Metadata_mg_per_ml" in annotated.columns:
            annotated.loc[is_control, "Metadata_mg_per_ml"] = 0

    if perturbation_mode == "genetic" and "Metadata_pert_name" in annotated.columns:
        annotated = annotated.assign(
            Metadata_broad_sample_type=[
                "control" if name == "EMPTY" else "trt"
                for name in annotated.Metadata_pert_name
            ]
        )

    if "Metadata_broad_sample_type" in annotated.columns:
        annotated = annotated.assign(
            Metadata_pert_type=annotated.Metadata_broad_sample_type
        )
    else:
        annotated = annotated.assign(
            Metadata_pert_type="", Metadata_broad_sample_type=""
        )

    return annotated

cp_clean(profiles)

Specifically clean certain column names derived from different CellProfiler versions.

Parameters:

Name Type Description Default
profiles DataFrame

DataFrame of profiles.

required

Returns:

Type Description
profiles

Renamed to standard metadata

Source code in pycytominer/cyto_utils/annotate_custom.py
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
def cp_clean(profiles):
    """Specifically clean certain column names derived from different CellProfiler versions.

    Parameters
    ----------
    profiles : pandas.core.frame.DataFrame
        DataFrame of profiles.

    Returns
    -------
    profiles
        Renamed to standard metadata
    """
    # map legacy image-prefixed metadata names to the standard ones
    column_mapper = {
        "Image_Metadata_Plate": "Metadata_Plate",
        "Image_Metadata_Well": "Metadata_Well",
    }
    return profiles.rename(columns=column_mapper)

pycytominer.cyto_utils.cell_locations

Utility function to augment a metadata file with X,Y locations of cells in each image.

CellLocation

Class holding all the functions to augment a metadata file with X,Y locations of cells in each image.

In the metadata file, which is either a CSV or a Parquet file, - Each row is single multi-channel image - Each image is indexed by multiple columns, e.g., Metadata_Plate, Metadata_Well,Metadata_Site

The single_cell SQLite file contains at least two tables - Nuclei, which has the single-cell-level readouts, including location information - Image, which has the image-level readouts, as well as metadata to link to the metadata file

In the Nuclei table, - Each row is a cell - Each cell has at least 3 columns: Nuclei_Location_Center_X, Nuclei_Location_Center_Y, ImageNumber

In the Image table, - Each row is an image - Each image has at least the same columns as the images in the metadata file are indexed by, e.g., Metadata_Plate,Metadata_Well,Metadata_Site

The methods in this class do the following - Read the metadata file - Read the single_cell file - For each image in the metadata file, find the corresponding image in the single_cell file - For each cell in the corresponding image, find the X,Y location - Add the X,Y locations of all cells to the metadata file in the corresponding row, packed into a single column

Attributes:

Name Type Description
metadata_input str or Pandas DataFrame

Path to the input metadata file or a Pandas DataFrame

single_cell_input str or Engine

Path to the single_cell file or a sqlalchemy.engine.Engine object

augmented_metadata_output str

Path to the output file. If None, the metadata file is not saved to disk

image_column default = 'ImageNumber'

Name of the column in the metadata file that links to the single_cell file, in combination with table_column

image_key default = ['Metadata_Plate', 'Metadata_Well', 'Metadata_Site']

Names of the columns in the metadata file that uniquely identify each image

object_column default = 'ObjectNumber'

Name of the column in the single_cell file that identifies each cell

cell_x_loc default = 'Nuclei_Location_Center_X'

Name of the column in the single_cell file that contains the X location of each cell

cell_y_loc default = 'Nuclei_Location_Center_Y'

Name of the column in the single_cell file that contains the Y location of each cell

table_column default = 'TableNumber'

Name of the column in the metadata file that links to the single_cell file, in combination with image_column

Methods:

Name Description
add_cell_location

Augment the metadata file and optionally save it to a file

Source code in pycytominer/cyto_utils/cell_locations.py
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
class CellLocation:
    """Class holding all the functions to augment a metadata file with X,Y locations of cells in each image.

    In the metadata file, which is either a CSV or a Parquet file,
    - Each row is single multi-channel image
    - Each image is indexed by multiple columns, e.g., `Metadata_Plate`, `Metadata_Well`,`Metadata_Site`

    The single_cell SQLite file contains at least two tables
    - `Nuclei`, which has the single-cell-level readouts, including location information
    - `Image`, which has the image-level readouts, as well as metadata to link to the metadata file

    In the `Nuclei` table,
    - Each row is a cell
    - Each cell has at least 3 columns: `Nuclei_Location_Center_X`, `Nuclei_Location_Center_Y`, `ImageNumber`

    In the `Image` table,
    - Each row is an image
    - Each image has at least the same columns as the images in the metadata file are indexed by, e.g., `Metadata_Plate`,`Metadata_Well`,`Metadata_Site`

    The methods in this class do the following
    - Read the metadata file
    - Read the single_cell file
    - For each image in the metadata file, find the corresponding image in the single_cell file
    - For each cell in the corresponding image, find the X,Y location
    - Add the X,Y locations of all cells to the metadata file in the corresponding row, packed into a single column


    Attributes
    ----------
    metadata_input : str or Pandas DataFrame
        Path to the input metadata file or a Pandas DataFrame

    single_cell_input : str or sqlalchemy.engine.Engine
        Path to the single_cell file or a sqlalchemy.engine.Engine object

    augmented_metadata_output : str
        Path to the output file. If None, the metadata file is not saved to disk

    image_column : default = 'ImageNumber'
        Name of the column in the metadata file that links to the single_cell file, in combination with `table_column`

    image_key: default = None
        Names of the columns in the metadata file that uniquely identify each image.
        None selects ['Metadata_Plate', 'Metadata_Well', 'Metadata_Site']

    object_column : default = 'ObjectNumber'
        Name of the column in the single_cell file that identifies each cell

    cell_x_loc : default = 'Nuclei_Location_Center_X'
        Name of the column in the single_cell file that contains the X location of each cell

    cell_y_loc : default = 'Nuclei_Location_Center_Y'
        Name of the column in the single_cell file that contains the Y location of each cell

    table_column : default = 'TableNumber'
        Name of the column in the metadata file that links to the single_cell file, in combination with `image_column`

    Methods
    -------
    add_cell_location()
        Augment the metadata file and optionally save it to a file

    """

    def __init__(
        self,
        metadata_input: Union[str, pd.DataFrame],
        single_cell_input: Union[str, sqlalchemy.engine.Engine],
        augmented_metadata_output: Optional[str] = None,
        overwrite: bool = False,
        image_column: str = "ImageNumber",
        object_column: str = "ObjectNumber",
        table_column: str = "TableNumber",
        image_key: Optional[list] = None,
        cell_x_loc: str = "Nuclei_Location_Center_X",
        cell_y_loc: str = "Nuclei_Location_Center_Y",
    ):
        self.metadata_input = self._expanduser(metadata_input)
        self.augmented_metadata_output = self._expanduser(augmented_metadata_output)
        self.single_cell_input = self._expanduser(single_cell_input)
        self.overwrite = overwrite
        self.image_column = image_column
        self.object_column = object_column
        self.table_column = table_column
        # None (rather than a mutable list literal) is used as the default so a
        # single list instance is not shared across constructor calls
        self.image_key = (
            image_key
            if image_key is not None
            else ["Metadata_Plate", "Metadata_Well", "Metadata_Site"]
        )
        self.cell_x_loc = cell_x_loc
        self.cell_y_loc = cell_y_loc
        # Currently constrained to only anonymous access for S3 resources
        # https://github.com/cytomining/pycytominer/issues/268
        self.s3 = boto3.client(
            "s3", config=botocore.config.Config(signature_version=botocore.UNSIGNED)
        )

    def _expanduser(self, obj):
        """Expand the user home directory when `obj` is a local path string.

        Non-string inputs (DataFrames, engines, None) and S3 URIs are
        returned unchanged.
        """
        if obj is not None and isinstance(obj, str) and not obj.startswith("s3://"):
            return pathlib.Path(obj).expanduser().as_posix()
        return obj

    def _parse_s3_path(self, s3_path: str):
        """Parse an S3 path into a bucket and key.

        Parameters
        ----------
        s3_path : str
            The S3 path

        Returns
        -------
        str
            The bucket
        str
            The key
        """
        s3_path = s3_path.replace("s3://", "")

        bucket = s3_path.split("/")[0]

        key = "/".join(s3_path.split("/")[1:])

        return bucket, key

    def _s3_file_exists(self, s3_path: str):
        """Check if a file exists on S3.

        Parameters
        ----------
        s3_path : str
            The path to the file on S3

        Returns
        -------
        bool
            True if the file exists on S3, False otherwise
        """
        bucket, key = self._parse_s3_path(s3_path)

        try:
            self.s3.head_object(Bucket=bucket, Key=key)
        except botocore.exceptions.ClientError as e:
            # missing object or inaccessible object are both treated as absent
            if e.response["Error"]["Code"] in ["404", "400", "403"]:
                return False
            else:
                raise
        else:
            return True

    def _download_s3(self, uri: str):
        """Download a file from S3 to a temporary file and return the temporary path."""
        bucket, key = self._parse_s3_path(uri)

        tmp_file = tempfile.NamedTemporaryFile(
            delete=False, suffix=pathlib.Path(key).name
        )

        self.s3.download_file(bucket, key, tmp_file.name)

        return tmp_file.name

    def _load_metadata(self):
        """Load the metadata into a Pandas DataFrame.

        Returns
        -------
        Pandas DataFrame
            The metadata loaded into a Pandas DataFrame
        """
        if not isinstance(self.metadata_input, pd.DataFrame):
            # verify that the metadata file is a CSV or a Parquet file

            if not (
                self.metadata_input.endswith(".csv")
                or self.metadata_input.endswith(".parquet")
            ):
                raise ValueError("Metadata file must be a CSV or a Parquet file")

            storage_options = (
                {"anon": True} if self.metadata_input.startswith("s3://") else None
            )

            # load the metadata file into a Pandas DataFrame
            if self.metadata_input.endswith(".csv"):
                df = pd.read_csv(
                    self.metadata_input, dtype=str, storage_options=storage_options
                )
            else:
                df = pd.read_parquet(
                    self.metadata_input, storage_options=storage_options
                )

            # cast all columns to string
            df = df.astype(str)
        else:
            df = self.metadata_input

        # verify that the image index columns are present in the metadata object

        if not all(elem in df.columns for elem in self.image_key):
            raise ValueError(
                f"Image index columns {self.image_key} are not present in the metadata file"
            )

        return df

    def _create_nested_df(self, df: pd.DataFrame):
        """Create a new column `CellCenters` by nesting the X and Y locations of cell from an image into the row of the image.

        Parameters
        ----------
        df : Pandas DataFrame
            The DataFrame to convert

        Returns
        -------
        Pandas DataFrame
        """
        # define a dictionary to store the output
        output_df_list = collections.defaultdict(list)

        # iterate over each group of cells in the merged DataFrame
        group_cols = [*self.image_key, self.image_column, self.table_column]

        for group_values, cell_df in df.groupby(group_cols):
            # add the image-level information to the output dictionary
            for key, value in zip(group_cols, group_values):
                output_df_list[key].append(value)

            # convert the cell DataFrame to a dictionary
            cell_dict = cell_df.to_dict(orient="list")

            # iterate over each cell in the cell DataFrame
            row_cell_dicts = []
            for object_column, cell_x_loc, cell_y_loc in zip(
                cell_dict[self.object_column],
                cell_dict[self.cell_x_loc],
                cell_dict[self.cell_y_loc],
            ):
                # add the cell information to a dictionary
                row_cell_dicts.append({
                    self.object_column: object_column,
                    self.cell_x_loc: cell_x_loc,
                    self.cell_y_loc: cell_y_loc,
                })

            # add the cell-level information to the output dictionary
            output_df_list["CellCenters"].append(row_cell_dicts)

        # convert the output dictionary to a Pandas DataFrame
        return pd.DataFrame(output_df_list)

    def _get_single_cell_engine(self):
        """Get the sqlalchemy.engine.Engine object for the single_cell file."""
        if isinstance(self.single_cell_input, str):
            # check if the single_cell file is a SQLite file
            if not self.single_cell_input.endswith(".sqlite"):
                raise ValueError("single_cell file must be a SQLite file")

            # if the single_cell file is an S3 path, download it to a temporary file
            if self.single_cell_input.startswith("s3://"):
                temp_single_cell_input = self._download_s3(self.single_cell_input)

                # connect to the single_cell file
                engine = sqlalchemy.create_engine(f"sqlite:///{temp_single_cell_input}")
            else:
                # connect to the single_cell file
                engine = sqlalchemy.create_engine(f"sqlite:///{self.single_cell_input}")
                temp_single_cell_input = None

        else:
            engine = self.single_cell_input
            temp_single_cell_input = None

        return temp_single_cell_input, engine

    def _check_single_cell_correctness(self, engine: sqlalchemy.engine.Engine):
        """Check that the single_cell file has the required tables and columns."""
        inspector = sqlalchemy.inspect(engine)

        if not all(
            table_name in inspector.get_table_names()
            for table_name in ["Image", "Nuclei"]
        ):
            raise ValueError(
                "Image and Nuclei tables are not present in the single_cell file"
            )

        # Verify that the required columns are present in the single_cell file

        nuclei_columns = [column["name"] for column in inspector.get_columns("Nuclei")]

        if not all(
            column_name in nuclei_columns
            for column_name in [
                self.image_column,
                self.table_column,
                self.object_column,
                self.cell_x_loc,
                self.cell_y_loc,
            ]
        ):
            raise ValueError(
                "Required columns are not present in the Nuclei table in the SQLite file"
            )

        image_columns = [column["name"] for column in inspector.get_columns("Image")]

        if not (
            self.image_column in image_columns
            and self.table_column in image_columns
            and all(elem in image_columns for elem in self.image_key)
        ):
            raise ValueError(
                "Required columns are not present in the Image table in the SQLite file"
            )

    def _get_joined_image_nuclei_tables(self):
        """Merge the Image and Nuclei tables in SQL."""
        # get the sqlalchemy.engine.Engine object for the single_cell file
        temp_single_cell_input, engine = self._get_single_cell_engine()

        # check that the single_cell file has the required tables and columns
        self._check_single_cell_correctness(engine)

        image_index_str = ", ".join(self.image_key)

        # merge the Image and Nuclei tables in SQL

        join_query = f"""
        SELECT Nuclei.{self.table_column},Nuclei.{self.image_column},Nuclei.{self.object_column},Nuclei.{self.cell_x_loc},Nuclei.{self.cell_y_loc},Image.{image_index_str}
        FROM Nuclei
        INNER JOIN Image
        ON Nuclei.{self.image_column} = Image.{self.image_column} and Nuclei.{self.table_column} = Image.{self.table_column};
        """

        column_types = {
            self.image_column: "int64",
            self.table_column: "int64",
            self.object_column: "int64",
            self.cell_x_loc: "float",
            self.cell_y_loc: "float",
        }

        for image_key in self.image_key:
            column_types[image_key] = "str"

        joined_df = pd.read_sql_query(join_query, engine, dtype=column_types)

        # if the single_cell file was downloaded from S3, delete the temporary file
        if temp_single_cell_input is not None:
            pathlib.Path(temp_single_cell_input).unlink()

        return joined_df

    def _load_single_cell(self):
        """Load the required columns from the `Image` and `Nuclei` tables in the single_cell file or sqlalchemy.engine.Engine object into a Pandas DataFrame.

        Returns
        -------
        Pandas DataFrame
            The required columns from the `Image` and `Nuclei` tables loaded into a Pandas DataFrame
        """
        return self._create_nested_df(self._get_joined_image_nuclei_tables())

    def add_cell_location(self):
        """Add the X,Y locations of all cells to the metadata file in the corresponding row, packed into a single column.

        Optionally, save the augmented metadata file as a Parquet file.

        Returns
        -------
        Pandas DataFrame
            Either a data frame or the path to a Parquet file with the X,Y locations of all cells packed into a single column
        """
        # If self.augmented_metadata_output is not None and it is a str and the file already exists, there is nothing to do
        if (
            self.augmented_metadata_output is not None
            and isinstance(self.augmented_metadata_output, str)
            and self.overwrite is False
            and (
                # Check if the file exists on S3 or locally
                (
                    self.augmented_metadata_output.startswith("s3://")
                    and self._s3_file_exists(self.augmented_metadata_output)
                )
                or (
                    not self.augmented_metadata_output.startswith("s3://")
                    and pathlib.Path(self.augmented_metadata_output).exists()
                )
            )
        ):
            # TODO: Consider doing a quick difference check should the file already exist.
            # For example, if the file already exists and it's different than what could be possibly incoming, should the user know?
            # This will involve performing all the steps below and then doing a check to see if the file is different, so this is a bit of a pain.
            return self.augmented_metadata_output

        # Load the data
        metadata_df = self._load_metadata()
        single_cell_df = self._load_single_cell()

        # Merge the data and single_cell tables
        augmented_metadata_df = pd.merge(
            metadata_df,
            single_cell_df,
            on=self.image_key,
            how="left",
        )

        # If self.augmented_metadata_output is not None, save the data
        if self.augmented_metadata_output is not None:
            # TODO: switch to https://github.com/cytomining/pycytominer/blob/main/pycytominer/cyto_utils/output.py if we want to support more file types
            augmented_metadata_df.to_parquet(
                self.augmented_metadata_output, index=False
            )
            return self.augmented_metadata_output
        else:
            return augmented_metadata_df

add_cell_location()

Add the X,Y locations of all cells to the metadata file in the corresponding row, packed into a single column.

Optionally, save the augmented metadata file as a Parquet file.

Returns:

Type Description
Pandas DataFrame

Either a data frame or the path to a Parquet file with the X,Y locations of all cells packed into a single column

Source code in pycytominer/cyto_utils/cell_locations.py
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
def add_cell_location(self):
    """Attach the X,Y locations of all cells to each metadata row, packed into a single column.

    Optionally, save the augmented metadata file as a Parquet file.

    Returns
    -------
    Pandas DataFrame
        Either a data frame or the path to a Parquet file with the X,Y locations of all cells packed into a single column
    """
    output = self.augmented_metadata_output

    # When an output path is configured, overwriting is disabled, and the file
    # already exists (on S3 or locally), skip all work and hand back the path.
    if output is not None and isinstance(output, str) and self.overwrite is False:
        if output.startswith("s3://"):
            already_written = self._s3_file_exists(output)
        else:
            already_written = pathlib.Path(output).exists()
        if already_written:
            # TODO: Consider doing a quick difference check should the file already exist.
            # For example, if the file already exists and it's different than what could be possibly incoming, should the user know?
            # This will involve performing all the steps below and then doing a check to see if the file is different, so this is a bit of a pain.
            return output

    # Load the metadata and single-cell tables, then join cell locations onto
    # each metadata row (left join keeps metadata rows without matching cells).
    augmented_metadata_df = pd.merge(
        self._load_metadata(),
        self._load_single_cell(),
        on=self.image_key,
        how="left",
    )

    # No output path configured: return the in-memory dataframe directly
    if output is None:
        return augmented_metadata_df

    # TODO: switch to https://github.com/cytomining/pycytominer/blob/main/pycytominer/cyto_utils/output.py if we want to support more file types
    augmented_metadata_df.to_parquet(output, index=False)
    return output

pycytominer.cyto_utils.cell_locations_cmd

CLI for cell location calculations.

pycytominer.cyto_utils.cells

Module containing the SingleCells class, which is used to interact with single cell morphological profiles.

SingleCells

Class to interact with single cell morphological profiles including aggregation, normalization, and output.

Attributes:

Name Type Description
sql_file str

SQLite connection pointing to the single cell database. The string prefix must be "sqlite:///".

strata list of str, default ["Metadata_Plate", "Metadata_Well"]

The columns to groupby and aggregate single cells.

aggregation_operation str, default "median"

Operation to perform single cell aggregation.

output_file str, default None

If specified, the location to write the file.

compartments list of str, default ["cells", "cytoplasm", "nuclei"]

List of compartments to process.

compartment_linking_cols dict, default noted below

Dictionary identifying how to merge columns across tables.

merge_cols list of str, default ["TableNumber", "ImageNumber"]

Columns indicating how to merge image and compartment data.

image_cols list of str, default ["TableNumber", "ImageNumber", "Metadata_Site"]

Columns to select from the image table.

add_image_features bool, default False

Whether to add image features to the profiles.

image_feature_categories list of str, optional

List of categories of features from the image table to add to the profiles.

features str or list of str, default "infer"

List of features that should be loaded or aggregated.

load_image_data bool, default True

Whether or not the image data should be loaded into memory.

image_table_name str, default "image"

The name of the table inside the SQLite file of image measurements.

subsample_frac float, default 1

The percentage of single cells to select (0 < subsample_frac <= 1).

subsample_n str or int, default "all"

How many samples to subsample - do not specify both subsample_frac and subsample_n.

subsampling_random_state str or int, default None

The random state to init subsample.

fields_of_view list of int, str, default "all"

List of fields of view to aggregate.

fields_of_view_feature str, default "Metadata_Site"

Name of the fields of view feature.

object_feature str, default "Metadata_ObjectNumber"

Object number feature.

default_datatype_float type

Numpy floating point datatype to use for load_compartment and resulting dataframes. This parameter may be used to assist with performance-related issues by reducing the memory required for floating-point data. For example, using np.float32 instead of np.float64 for this parameter will reduce memory consumed by float columns by roughly 50%. Please note: using any besides np.float64 are experimentally unverified.

Notes

.. note:: the argument compartment_linking_cols is designed to work with CellProfiler output, as curated by cytominer-database. The default is: { "cytoplasm": { "cells": "Cytoplasm_Parent_Cells", "nuclei": "Cytoplasm_Parent_Nuclei", }, "cells": {"cytoplasm": "ObjectNumber"}, "nuclei": {"cytoplasm": "ObjectNumber"}, }

Source code in pycytominer/cyto_utils/cells.py
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
class SingleCells:
    """Class to interact with single cell morphological profiles including aggregation, normalization, and output.

    Attributes
    ----------
    sql_file : str
        SQLite connection pointing to the single cell database.
        The string prefix must be "sqlite:///".
    strata : list of str, default ["Metadata_Plate", "Metadata_Well"]
        The columns to groupby and aggregate single cells.
    aggregation_operation : str, default "median"
        Operation to perform single cell aggregation.
    output_file : str, default None
        If specified, the location to write the file.
    compartments : list of str, default ["cells", "cytoplasm", "nuclei"]
        List of compartments to process.
    compartment_linking_cols : dict, default noted below
        Dictionary identifying how to merge columns across tables.
    merge_cols : list of str, default ["TableNumber", "ImageNumber"]
        Columns indicating how to merge image and compartment data.
    image_cols : list of str, default ["TableNumber", "ImageNumber", "Metadata_Site"]
        Columns to select from the image table.
    add_image_features: bool, default False
        Whether to add image features to the profiles.
    image_feature_categories : list of str, optional
        List of categories of features from the image table to add to the profiles.
    features: str or list of str, default "infer"
        List of features that should be loaded or aggregated.
    load_image_data : bool, default True
        Whether or not the image data should be loaded into memory.
    image_table_name : str, default "image"
        The name of the table inside the SQLite file of image measurements.
    subsample_frac : float, default 1
        The percentage of single cells to select (0 < subsample_frac <= 1).
    subsample_n : str or int, default "all"
        How many samples to subsample - do not specify both subsample_frac and subsample_n.
    subsampling_random_state : str or int, default None
        The random state to init subsample.
    fields_of_view : list of int, str, default "all"
        List of fields of view to aggregate.
    fields_of_view_feature : str, default "Metadata_Site"
        Name of the fields of view feature.
    object_feature : str, default "Metadata_ObjectNumber"
        Object number feature.
    default_datatype_float: type
        Numpy floating point datatype to use for load_compartment and resulting
        dataframes. This parameter may be used to assist with performance-related
        issues by reducing the memory required for floating-point data.
        For example, using np.float32 instead of np.float64 for this parameter
        will reduce memory consumed by float columns by roughly 50%.
        Please note: using any besides np.float64 are experimentally
        unverified.

    Notes
    -----
    .. note::
        the argument compartment_linking_cols is designed to work with CellProfiler output,
        as curated by cytominer-database. The default is: {
            "cytoplasm": {
                "cells": "Cytoplasm_Parent_Cells",
                "nuclei": "Cytoplasm_Parent_Nuclei",
            },
            "cells": {"cytoplasm": "ObjectNumber"},
            "nuclei": {"cytoplasm": "ObjectNumber"},
        }
    """

    def __init__(
        self,
        sql_file,
        strata=None,
        aggregation_operation="median",
        output_file=None,
        compartments=default_compartments,
        compartment_linking_cols=default_linking_cols,
        merge_cols=None,
        image_cols=None,
        add_image_features=False,
        image_feature_categories=None,
        features="infer",
        load_image_data=True,
        image_table_name="image",
        subsample_frac=1,
        subsample_n="all",
        subsampling_random_state=None,
        fields_of_view="all",
        fields_of_view_feature="Metadata_Site",
        object_feature="Metadata_ObjectNumber",
        default_datatype_float=np.float64,
    ):
        """Construct a SingleCells object.

        Parameter semantics are documented on the class docstring. The
        list-valued parameters (strata, merge_cols, image_cols) use None
        sentinels resolved to fresh lists per call, avoiding the shared
        mutable-default-argument pitfall while keeping the same effective
        defaults for callers.
        """
        # Resolve None sentinels so each instance owns its own default lists
        if strata is None:
            strata = ["Metadata_Plate", "Metadata_Well"]
        if merge_cols is None:
            merge_cols = ["TableNumber", "ImageNumber"]
        if image_cols is None:
            image_cols = ["TableNumber", "ImageNumber", "Metadata_Site"]

        # Check compartments specified
        check_compartments(compartments)

        # Check if correct operation is specified
        aggregation_operation = check_aggregate_operation(aggregation_operation)

        # Check that the subsample_frac is between 0 and 1
        assert (  # noqa: S101
            subsample_frac > 0 and subsample_frac <= 1
        ), "subsample_frac must be between 0 and 1"

        self.sql_file = sql_file
        self.strata = strata
        self.load_image_data = load_image_data
        self.image_table_name = image_table_name
        self.aggregation_operation = aggregation_operation.lower()
        self.output_file = output_file
        self.merge_cols = merge_cols
        self.image_cols = image_cols
        self.add_image_features = add_image_features
        self.image_feature_categories = image_feature_categories
        self.features = features
        self.subsample_frac = subsample_frac
        self.subsample_n = subsample_n
        self.subset_data_df = None
        self.subsampling_random_state = subsampling_random_state
        self.is_aggregated = False
        self.is_subset_computed = False
        self.compartments = compartments
        self.compartment_linking_cols = compartment_linking_cols
        self.fields_of_view_feature = fields_of_view_feature
        self.object_feature = object_feature
        self.default_datatype_float = default_datatype_float

        # Confirm that the compartments and linking cols are formatted properly
        assert_linking_cols_complete(
            compartments=self.compartments, linking_cols=self.compartment_linking_cols
        )

        # Build a dictionary to update linking column feature names
        self.linking_col_rename = provide_linking_cols_feature_name_update(
            self.compartment_linking_cols
        )

        if self.subsample_n != "all":
            self.set_subsample_n(self.subsample_n)

        # Connect to sqlite engine
        self.engine = create_engine(self.sql_file)
        self.conn = self.engine.connect()

        # Throw an error if both subsample_frac and subsample_n is set
        self._check_subsampling()

        # Confirm that the input fields of view is valid
        self.fields_of_view = check_fields_of_view_format(fields_of_view)

        # attribute to track image table data load status
        self.image_data_loaded = False
        if self.load_image_data:
            self.load_image(image_table_name=self.image_table_name)
    def _check_subsampling(self):
        """Verify that subsample_frac and subsample_n were not both specified.

        Returns
        -------
        None
            Nothing is returned.
        """
        # Both settings at their defaults-or-one-changed is fine; changing
        # both at once is ambiguous and therefore rejected.
        both_configured = self.subsample_frac != 1 and self.subsample_n != "all"
        assert not both_configured, (  # noqa: S101
            "Do not set both subsample_frac and subsample_n"
        )

    def set_output_file(self, output_file):
        """Point the object at a new output file location.

        Parameters
        ----------
        output_file : str
            New output file name.

        Returns
        -------
        None
            Nothing is returned.
        """
        self.output_file = output_file

    def set_subsample_frac(self, subsample_frac):
        """Update the fraction of single cells to subsample.

        Parameters
        ----------
        subsample_frac : float, default 1
            Percentage of single cells to select (0 < subsample_frac <= 1).

        Returns
        -------
        None
            Nothing is returned.
        """
        self.subsample_frac = subsample_frac
        # Re-validate so frac and n are never both configured
        self._check_subsampling()

    def set_subsample_n(self, subsample_n):
        """Set or update the subsample n.

        Parameters
        ----------
        subsample_n : int, default "all"
            Indicate how many samples to subsample - do not specify both subsample_frac and subsample_n.

        Returns
        -------
        None
            Nothing is returned.

        Raises
        ------
        ValueError
            If subsample_n cannot be coerced to an integer.
        """
        try:
            self.subsample_n = int(subsample_n)
        except ValueError as err:
            # Chain the original exception so the offending value's traceback
            # is preserved for debugging (PEP 3134 / ruff B904).
            raise ValueError("subsample n must be an integer or coercable") from err
        self._check_subsampling()

    def set_subsample_random_state(self, random_state):
        """Record the random state used to initialize subsampling.

        Parameters
        ----------
        random_state: int, optional
            The random state to init subsample.

        Returns
        -------
        None
            Nothing is returned.
        """
        self.subsampling_random_state = random_state

    def load_image(self, image_table_name=None):
        """Load the image table from the sqlite file into memory.

        Parameters
        ----------
        image_table_name : str, optional
            Table to read; falls back to self.image_table_name when None.

        Returns
        -------
        None
            Nothing is returned.
        """
        table = (
            self.image_table_name if image_table_name is None else image_table_name
        )

        self.image_df = pd.read_sql(sql=f"select * from {table}", con=self.conn)

        if self.add_image_features:
            self.image_features_df = extract_image_features(
                self.image_feature_categories,
                self.image_df,
                self.image_cols,
                self.strata,
            )

        # Keep only the identifying and strata columns of the image table
        keep_columns = list(np.union1d(self.image_cols, self.strata))
        self.image_df = self.image_df[keep_columns]

        if self.fields_of_view != "all":
            check_fields_of_view(
                list(np.unique(self.image_df[self.fields_of_view_feature])),
                list(self.fields_of_view),
            )
            # Restrict both image and image-feature tables to requested FOVs
            fov_query = f"{self.fields_of_view_feature}==@self.fields_of_view"
            self.image_df = self.image_df.query(fov_query)

            if self.add_image_features:
                self.image_features_df = self.image_features_df.query(fov_query)

        self.image_data_loaded = True

    def count_cells(self, compartment="cells", count_subset=False):
        """Determine how many cells are measured per well.

        Parameters
        ----------
        compartment : str, default "cells"
            Compartment to subset.
        count_subset : bool, default False
            Whether or not count the number of cells as specified by the strata groups.

        Returns
        -------
        pandas.core.frame.DataFrame
            DataFrame of cell counts in the experiment.
        """
        check_compartments(compartment)

        if count_subset:
            # Counting the subset requires that both steps ran beforehand
            assert self.is_aggregated, "Make sure to aggregate_profiles() first!"  # noqa: S101
            assert self.is_subset_computed, "Make sure to get_subsample() first!"  # noqa: S101
            return (
                self.subset_data_df.groupby(self.strata)["Metadata_ObjectNumber"]
                .count()
                .reset_index()
                .rename({"Metadata_ObjectNumber": "cell_count"}, axis="columns")
            )

        # Pull object identifiers from the database and keep only objects that
        # match an image row before counting per strata group
        compartment_df = pd.read_sql(
            sql=f"select TableNumber, ImageNumber, ObjectNumber from {compartment}",
            con=self.conn,
        )
        merged_df = self.image_df.merge(
            compartment_df, how="inner", on=self.merge_cols
        )
        return (
            merged_df.groupby(self.strata)["ObjectNumber"]
            .count()
            .reset_index()
            .rename({"ObjectNumber": "cell_count"}, axis="columns")
        )

    def subsample_profiles(self, df, rename_col=True):
        """Sample a Pandas DataFrame given subsampling information.

        Parameters
        ----------
        df : pandas.core.frame.DataFrame
            DataFrame of a single cell profile.
        rename_col : bool, default True
            Whether or not to rename the columns.

        Returns
        -------
        pandas.core.frame.DataFrame
            A subsampled pandas dataframe of single cell profiles.
        """
        # Lazily pick and remember a random seed the first time sampling runs
        if self.subsampling_random_state is None:
            self.set_subsample_random_state(np.random.randint(0, 10000, size=1)[0])

        if self.subsample_frac == 1:
            # Fixed-count sampling (with replacement, so n may exceed group size)
            sampled_df = df.sample(
                n=self.subsample_n,
                replace=True,
                random_state=self.subsampling_random_state,
            )
        else:
            # Fractional sampling without replacement
            sampled_df = df.sample(
                frac=self.subsample_frac,
                random_state=self.subsampling_random_state,
            )

        if rename_col:
            sampled_df = sampled_df.rename(self.linking_col_rename, axis="columns")

        return sampled_df

    def get_subsample(self, df=None, compartment="cells", rename_col=True):
        """Apply the subsampling procedure.

        Parameters
        ----------
        df : pandas.core.frame.DataFrame
            DataFrame of a single cell profile.
        compartment : str, default "cells"
            The compartment to process.
        rename_col : bool, default True
            Whether or not to rename the columns.

        Returns
        -------
        None
            Nothing is returned.
        """
        check_compartments(compartment)

        # Pull object identifiers from the database if no dataframe was given
        if df is None:
            df = pd.read_sql(
                sql=f"select TableNumber, ImageNumber, ObjectNumber from {compartment}",
                con=self.conn,
            )

        merged_df = self.image_df.merge(df, how="inner", on=self.merge_cols)

        # Subsample each strata group independently, then flatten the index
        self.subset_data_df = (
            merged_df.groupby(self.strata)
            .apply(lambda group: self.subsample_profiles(group, rename_col=rename_col))
            .reset_index(drop=True)
        )

        self.is_subset_computed = True

    def count_sql_table_rows(self, table):
        """Return the total number of rows in a database table."""
        result = self.conn.execute(f"SELECT COUNT(*) FROM {table}")
        # The query yields a single one-element row; unpack it directly
        (num_rows,) = next(result)
        return num_rows

    def get_sql_table_col_names(self, table):
        """Return the column names of a database table."""
        # A LIMIT 1 query is enough to populate the cursor's column description
        cursor = self.conn.execute(f"SELECT * FROM {table} LIMIT 1").cursor
        return [description[0] for description in cursor.description]

    def split_column_categories(self, col_names):
        """Split a list of column names into metadata and feature column lists.

        A column counts as a feature when its lowercased name starts with any
        configured compartment name; everything else is metadata.
        """
        prefixes = tuple(self.compartments)
        feat_cols = [name for name in col_names if name.lower().startswith(prefixes)]
        meta_cols = [
            name for name in col_names if not name.lower().startswith(prefixes)
        ]
        return meta_cols, feat_cols

    def load_compartment(self, compartment):
        """Create the compartment dataframe.

        Note: makes use of default_datatype_float attribute
        for setting a default floating point datatype.

        Parameters
        ----------
        compartment : str
            The compartment to process.

        Returns
        -------
        pandas.core.frame.DataFrame
            Compartment dataframe.
        """
        # Get data useful to pre-alloc memory
        num_cells = self.count_sql_table_rows(compartment)
        col_names = self.get_sql_table_col_names(compartment)
        if self.features != "infer":  # allow to get only some features
            col_names = [x for x in col_names if x in self.features]
        meta_cols, feat_cols = self.split_column_categories(col_names)
        num_meta, num_feats = len(meta_cols), len(feat_cols)

        # Use pre-allocated np.array for feature data
        feats = np.empty(
            shape=(num_cells, num_feats), dtype=self.default_datatype_float
        )
        # Use pre-allocated pd.DataFrame for metadata
        metas = pd.DataFrame(columns=meta_cols, index=range(num_cells))

        # Query database for selected columns of chosen compartment
        # The SELECT lists metadata columns first, then feature columns, so
        # each returned row can be split at index num_meta below.
        columns = ", ".join(meta_cols + feat_cols)
        query = f"select {columns} from {compartment}"
        query_result = self.conn.execute(query)

        # Load data row by row for both meta information and features
        # NOTE(review): per-row fill into pre-sized containers keeps peak memory
        # low, at the cost of Python-level iteration over every row.
        for i, row in enumerate(query_result):
            metas.loc[i] = row[:num_meta]
            feats[i] = row[num_meta:]

        # Return concatenated data and metainformation of compartment
        return pd.concat([metas, pd.DataFrame(columns=feat_cols, data=feats)], axis=1)

    def aggregate_compartment(
        self,
        compartment,
        compute_subsample=False,
        compute_counts=False,
        add_image_features=False,
        n_aggregation_memory_strata=1,
    ):
        """Aggregate morphological profiles. Uses pycytominer.aggregate().

        Parameters
        ----------
        compartment : str
            Compartment to aggregate.
        compute_subsample : bool, default False
            Whether or not to subsample.
        compute_counts : bool, default False
            Whether or not to compute the number of objects in each compartment
            and the number of fields of view per well.
        add_image_features : bool, default False
            Whether or not to add image features.
        n_aggregation_memory_strata : int, default 1
            Number of unique strata to pull from the database into working memory
            at once.  Typically 1 is fastest.  A larger number uses more memory.
            For example, if aggregating by "well", then n_aggregation_memory_strata=1
            means that one "well" will be pulled from the SQLite database into
            memory at a time.

        Returns
        -------
        pandas.core.frame.DataFrame
            DataFrame of aggregated profiles.
        """
        check_compartments(compartment)

        # Subsample only when explicitly requested AND a subsampling scheme
        # (fraction < 1 or a fixed n) has been configured
        if (self.subsample_frac < 1 or self.subsample_n != "all") and compute_subsample:
            self.get_subsample(compartment=compartment)

        # Load image data if not already loaded
        if not self.image_data_loaded:
            self.load_image(image_table_name=self.image_table_name)

        # Iteratively call aggregate() on chunks of the full compartment table
        object_dfs = []
        for compartment_df in self._compartment_df_generator(
            compartment=compartment,
            n_aggregation_memory_strata=n_aggregation_memory_strata,
        ):
            # Attach image-level identifiers to each single cell and normalize
            # linking column names before aggregation
            population_df = self.image_df.merge(
                compartment_df,
                how="inner",
                on=self.merge_cols,
            ).rename(self.linking_col_rename, axis="columns")

            if self.features == "infer":
                aggregate_features = infer_cp_features(
                    population_df, compartments=compartment
                )
            else:
                aggregate_features = self.features

            # Aggregate this chunk of single cells into per-stratum profiles
            partial_object_df = aggregate(
                population_df=population_df,
                strata=self.strata,
                compute_object_count=compute_counts,
                operation=self.aggregation_operation,
                subset_data_df=self.subset_data_df,
                features=aggregate_features,
                object_feature=self.object_feature,
            )

            if compute_counts and self.fields_of_view_feature not in self.strata:
                # Append per-stratum field-of-view counts as metadata
                fields_count_df = aggregate_fields_count(
                    self.image_df, self.strata, self.fields_of_view_feature
                )

                if add_image_features:
                    fields_count_df = aggregate_image_features(
                        fields_count_df,
                        self.image_features_df,
                        self.image_feature_categories,
                        self.image_cols,
                        self.strata,
                        self.aggregation_operation,
                    )

                # Right merge keeps every aggregated profile row
                partial_object_df = fields_count_df.merge(
                    partial_object_df,
                    on=self.strata,
                    how="right",
                )

                # Separate all the metadata and feature columns.
                metadata_cols = infer_cp_features(partial_object_df, metadata=True)
                feature_cols = infer_cp_features(partial_object_df, image_features=True)

                # Reorder so metadata columns precede feature columns
                partial_object_df = partial_object_df.reindex(
                    columns=metadata_cols + feature_cols
                )

            object_dfs.append(partial_object_df)

        # Concatenate one or more aggregated dataframes row-wise into final output
        object_df = pd.concat(object_dfs, axis=0).reset_index(drop=True)

        return object_df

    def _compartment_df_generator(
        self,
        compartment,
        n_aggregation_memory_strata=1,
    ):
        """Yield chunks of the entire compartment table from disk.

        Chunks are constructed so that all rows belonging to any one
        aggregation stratum (per self.strata / self.merge_cols) land in the
        same chunk, keeping downstream groupby aggregations valid.

        Parameters
        ----------
        compartment : str
            Compartment to aggregate.
        n_aggregation_memory_strata : int, default 1
            Number of unique strata to pull from the database into working memory
            at once.  Typically 1 is fastest.  A larger number uses more memory.

        Returns
        -------
        image_df : Iterator[pandas.core.frame.DataFrame]
            A generator whose __next__() call returns a chunk of the compartment
            table, where rows comprising a unique aggregation stratum are not split
            between chunks, and thus groupby aggregations are valid

        """
        assert (  # noqa: S101
            n_aggregation_memory_strata > 0
        ), "Number of strata to pull into memory at once (n_aggregation_memory_strata) must be > 0"

        cols = "*"

        # Inspect a single row to discover the compartment table's columns
        first_row_df = pd.read_sql(
            sql=f"select {cols} from {compartment} limit 1",
            con=self.conn,
        )
        all_columns = first_row_df.columns
        if self.features != "infer":  # allow to get only some features
            all_columns = [x for x in all_columns if x in self.features]

        # Ask SQLite for the storage class of each column
        typeof_str = ", ".join(f"typeof({col})" for col in all_columns)
        dtype_row_df = pd.read_sql(
            sql=f"select {typeof_str} from {compartment} limit 1",
            con=self.conn,
        )

        def _strip_typeof(name):
            # SQLite names these result columns 'typeof(<col>)'; recover '<col>'
            return name[7:-1]

        dtype_dict = dict(
            zip(
                [_strip_typeof(name) for name in dtype_row_df.columns],  # column names
                dtype_row_df.iloc[0].values,  # corresponding data types
            )
        )

        # Collect, for every unique strata combination, the merge_cols values it spans
        df_unique_mergecols = (
            self.image_df[self.strata + self.merge_cols]
            .groupby(self.strata)
            .agg(lambda s: np.unique(s).tolist())
            .reset_index(drop=True)
        )

        # Translate groups of strata into SQLite WHERE-clause condition strings
        strata_conditions = _sqlite_strata_conditions(
            df=df_unique_mergecols,
            dtypes=dtype_dict,
            n=n_aggregation_memory_strata,
        )

        # Stream one chunk of compartment rows per condition group
        for condition in strata_conditions:
            chunk_query = f"select {cols} from {compartment} where {condition}"
            yield pd.read_sql(sql=chunk_query, con=self.conn)

    def merge_single_cells(
        self,
        compute_subsample: bool = False,
        sc_output_file: Optional[str] = None,
        compression_options: Optional[str] = None,
        float_format: Optional[str] = None,
        single_cell_normalize: bool = False,
        normalize_args: Optional[Dict] = None,
        platemap: Optional[Union[str, pd.DataFrame]] = None,
        **kwargs,
    ):
        """Given the linking columns, merge single cell data. Normalization is also supported.

        Compartment tables are merged pairwise according to
        self.compartment_linking_cols, image metadata is joined on, linking
        columns are renamed with a "Metadata_" prefix, and the result is
        optionally normalized, annotated with platemap metadata, and written
        to file.

        Parameters
        ----------
        compute_subsample : bool, default False
            Whether or not to compute subsample.
        sc_output_file : str, optional
            The name of a file to output.
        compression_options : str, optional
            Compression arguments as input to pandas.to_csv() with pandas version >= 1.2.
        float_format : str, optional
            Decimal precision to use in writing output file.
        single_cell_normalize : bool, default False
            Whether or not to normalize the single cell data.
        normalize_args : dict, optional
            Additional arguments passed as input to pycytominer.normalize().
        platemap: str or pd.DataFrame, default None
            optional platemap filepath str or pd.DataFrame to be used with results via annotate

        Returns
        -------
        pandas.core.frame.DataFrame or str
            if sc_output_file=None returns a Pandas dataframe
            else will write to file and return the filepath of the file
        """
        # Load the single cell dataframe by merging on the specific linking columns
        # sc_df starts as an empty-string sentinel; it becomes a DataFrame once
        # the first (left) compartment is loaded (see isinstance check below).
        sc_df = ""
        linking_check_cols = []
        merge_suffix_rename = []
        for left_compartment in self.compartment_linking_cols:
            for right_compartment in self.compartment_linking_cols[left_compartment]:
                # Make sure only one merge per combination occurs
                # (linking cols are symmetric, so sort the pair to dedupe)
                linking_check = "-".join(sorted([left_compartment, right_compartment]))
                if linking_check in linking_check_cols:
                    continue

                # Specify how to indicate merge suffixes
                merge_suffix = [
                    f"_{left_compartment}",
                    f"_{right_compartment}",
                ]
                merge_suffix_rename += merge_suffix
                left_link_col = self.compartment_linking_cols[left_compartment][
                    right_compartment
                ]
                right_link_col = self.compartment_linking_cols[right_compartment][
                    left_compartment
                ]

                if isinstance(sc_df, str):
                    # First pass: seed sc_df with the left compartment table
                    sc_df = self.load_compartment(compartment=left_compartment)

                    if compute_subsample:
                        # Sample cells proportionally by self.strata
                        self.get_subsample(df=sc_df, rename_col=False)

                        subset_logic_df = self.subset_data_df.drop(
                            self.image_df.columns, axis="columns"
                        )

                        # Keep only subsampled rows while preserving sc_df's
                        # original column order
                        sc_df = subset_logic_df.merge(
                            sc_df, how="left", on=subset_logic_df.columns.tolist()
                        ).reindex(sc_df.columns, axis="columns")

                sc_df = sc_df.merge(
                    self.load_compartment(compartment=right_compartment),
                    left_on=[*self.merge_cols, left_link_col],
                    right_on=[*self.merge_cols, right_link_col],
                    suffixes=merge_suffix,
                )

                linking_check_cols.append(linking_check)

        # Add metadata prefix to merged suffixes
        full_merge_suffix_rename = []
        full_merge_suffix_original = []
        for col_name in self.merge_cols + list(self.linking_col_rename.keys()):
            full_merge_suffix_original.append(col_name)
            full_merge_suffix_rename.append(f"Metadata_{col_name}")

        # Also cover the suffixed variants produced by the pairwise merges above
        for col_name in self.merge_cols + list(self.linking_col_rename.keys()):
            for suffix in set(merge_suffix_rename):
                full_merge_suffix_original.append(f"{col_name}{suffix}")
                full_merge_suffix_rename.append(f"Metadata_{col_name}{suffix}")

        self.full_merge_suffix_rename = dict(
            zip(full_merge_suffix_original, full_merge_suffix_rename)
        )

        # Add image data to single cell dataframe
        if not self.image_data_loaded:
            self.load_image(image_table_name=self.image_table_name)

        sc_df = (
            self.image_df.merge(sc_df, on=self.merge_cols, how="right")
            # pandas rename performance may be improved using copy=False, inplace=False
            # reference: https://ryanlstevens.github.io/2022-05-06-pandasColumnRenaming/
            .rename(self.linking_col_rename, axis="columns", copy=False, inplace=False)
            .rename(
                self.full_merge_suffix_rename, axis="columns", copy=False, inplace=False
            )
        )
        if single_cell_normalize:
            # Inferring features is tricky with non-canonical data
            if normalize_args is None:
                normalize_args = {}
                features = infer_cp_features(sc_df, compartments=self.compartments)
            elif ("features" not in normalize_args) or (
                normalize_args["features"] == "infer"
            ):
                features = infer_cp_features(sc_df, compartments=self.compartments)
            else:
                features = normalize_args["features"]

            normalize_args["features"] = features

            sc_df = normalize(profiles=sc_df, **normalize_args)

        # In case platemap metadata is provided, use pycytominer.annotate for metadata
        if platemap is not None:
            sc_df = annotate(
                profiles=sc_df, platemap=platemap, output_file=None, **kwargs
            )

        # if output argument is provided, call it using df_merged_sc and kwargs
        if sc_output_file is not None:
            return output(
                df=sc_df,
                output_filename=sc_output_file,
                compression_options=compression_options,
                float_format=float_format,
                **kwargs,
            )
        else:
            return sc_df

    def aggregate_profiles(
        self,
        compute_subsample=False,
        output_file=None,
        compression_options=None,
        float_format=None,
        n_aggregation_memory_strata=1,
        **kwargs,
    ):
        """Aggregate and merge compartments. This is the primary entry to this class.

        Parameters
        ----------
        compute_subsample : bool, default False
            Whether or not to compute subsample. compute_subsample must be specified to perform subsampling.
            The function aggregate_profiles(compute_subsample=True) will apply subsetting even if subsample is initialized.
        output_file : str, optional
            The name of a file to output. We recommend that, if provided, the output file be suffixed with "_augmented".
        compression_options : str, optional
            Compression arguments as input to pandas.to_csv() with pandas version >= 1.2.
        float_format : str, optional
            Decimal precision to use in writing output file.
        n_aggregation_memory_strata : int, default 1
            Number of unique strata to pull from the database into working memory
            at once.  Typically 1 is fastest.  A larger number uses more memory.

        Returns
        -------
        pandas.core.frame.DataFrame or str
            if output_file=None returns a Pandas dataframe
            else will write to file and return the filepath of the file
        """
        if output_file is not None:
            self.set_output_file(output_file)

        # Aggregate each compartment in turn. The first compartment also
        # carries object counts and (optionally) image features; every later
        # compartment is merged onto it by the shared strata columns.
        for compartment_idx, compartment in enumerate(self.compartments):
            if compartment_idx == 0:
                aggregated = self.aggregate_compartment(
                    compartment=compartment,
                    compute_subsample=compute_subsample,
                    compute_counts=True,
                    add_image_features=self.add_image_features,
                    n_aggregation_memory_strata=n_aggregation_memory_strata,
                )
            else:
                aggregated = aggregated.merge(
                    self.aggregate_compartment(
                        compartment=compartment,
                        n_aggregation_memory_strata=n_aggregation_memory_strata,
                    ),
                    on=self.strata,
                    how="inner",
                )

        self.is_aggregated = True

        # self.output_file may have been set earlier (e.g. in __init__), so
        # check the attribute rather than the local output_file argument
        if self.output_file is not None:
            return output(
                df=aggregated,
                output_filename=self.output_file,
                compression_options=compression_options,
                float_format=float_format,
                **kwargs,
            )
        else:
            return aggregated

__init__(sql_file, strata=['Metadata_Plate', 'Metadata_Well'], aggregation_operation='median', output_file=None, compartments=default_compartments, compartment_linking_cols=default_linking_cols, merge_cols=['TableNumber', 'ImageNumber'], image_cols=['TableNumber', 'ImageNumber', 'Metadata_Site'], add_image_features=False, image_feature_categories=None, features='infer', load_image_data=True, image_table_name='image', subsample_frac=1, subsample_n='all', subsampling_random_state=None, fields_of_view='all', fields_of_view_feature='Metadata_Site', object_feature='Metadata_ObjectNumber', default_datatype_float=np.float64)

Construct a SingleCells object.

Source code in pycytominer/cyto_utils/cells.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
def __init__(
    self,
    sql_file,
    strata=["Metadata_Plate", "Metadata_Well"],
    aggregation_operation="median",
    output_file=None,
    compartments=default_compartments,
    compartment_linking_cols=default_linking_cols,
    merge_cols=["TableNumber", "ImageNumber"],
    image_cols=["TableNumber", "ImageNumber", "Metadata_Site"],
    add_image_features=False,
    image_feature_categories=None,
    features="infer",
    load_image_data=True,
    image_table_name="image",
    subsample_frac=1,
    subsample_n="all",
    subsampling_random_state=None,
    fields_of_view="all",
    fields_of_view_feature="Metadata_Site",
    object_feature="Metadata_ObjectNumber",
    default_datatype_float=np.float64,
):
    """Construct a SingleCells object.

    Validates the requested compartments, aggregation operation, and
    subsampling configuration, stores the settings on the instance, opens a
    database connection via SQLAlchemy from ``sql_file``, and (when
    ``load_image_data`` is True) loads the image table into memory.

    NOTE(review): the list defaults (strata, merge_cols, image_cols) are
    shared mutable defaults. They are only stored here, never mutated, but
    mutating e.g. ``self.strata`` in place would leak across instances.
    """
    # Check compartments specified
    check_compartments(compartments)

    # Check if correct operation is specified
    aggregation_operation = check_aggregate_operation(aggregation_operation)

    # Check that the subsample_frac is between 0 and 1
    assert (  # noqa: S101
        subsample_frac > 0 and subsample_frac <= 1
    ), "subsample_frac must be between 0 and 1"

    self.sql_file = sql_file
    self.strata = strata
    self.load_image_data = load_image_data
    self.image_table_name = image_table_name
    self.aggregation_operation = aggregation_operation.lower()
    self.output_file = output_file
    self.merge_cols = merge_cols
    self.image_cols = image_cols
    self.add_image_features = add_image_features
    self.image_feature_categories = image_feature_categories
    self.features = features
    self.subsample_frac = subsample_frac
    self.subsample_n = subsample_n
    self.subset_data_df = None
    self.subsampling_random_state = subsampling_random_state
    self.is_aggregated = False
    self.is_subset_computed = False
    self.compartments = compartments
    self.compartment_linking_cols = compartment_linking_cols
    self.fields_of_view_feature = fields_of_view_feature
    self.object_feature = object_feature
    self.default_datatype_float = default_datatype_float

    # Confirm that the compartments and linking cols are formatted properly
    assert_linking_cols_complete(
        compartments=self.compartments, linking_cols=self.compartment_linking_cols
    )

    # Build a dictionary to update linking column feature names
    self.linking_col_rename = provide_linking_cols_feature_name_update(
        self.compartment_linking_cols
    )

    if self.subsample_n != "all":
        self.set_subsample_n(self.subsample_n)

    # Connect to sqlite engine
    # NOTE(review): sql_file is treated as an SQLAlchemy connection string
    self.engine = create_engine(self.sql_file)
    self.conn = self.engine.connect()

    # Throw an error if both subsample_frac and subsample_n is set
    self._check_subsampling()

    # Confirm that the input fields of view is valid
    self.fields_of_view = check_fields_of_view_format(fields_of_view)

    # attribute to track image table data load status
    self.image_data_loaded = False
    if self.load_image_data:
        self.load_image(image_table_name=self.image_table_name)

aggregate_compartment(compartment, compute_subsample=False, compute_counts=False, add_image_features=False, n_aggregation_memory_strata=1)

Aggregate morphological profiles. Uses pycytominer.aggregate().

Parameters:

Name Type Description Default
compartment str

Compartment to aggregate.

required
compute_subsample bool

Whether or not to subsample.

False
compute_counts bool

Whether or not to compute the number of objects in each compartment and the number of fields of view per well.

False
add_image_features bool

Whether or not to add image features.

False
n_aggregation_memory_strata int

Number of unique strata to pull from the database into working memory at once. Typically 1 is fastest. A larger number uses more memory. For example, if aggregating by "well", then n_aggregation_memory_strata=1 means that one "well" will be pulled from the SQLite database into memory at a time.

1

Returns:

Type Description
DataFrame

DataFrame of aggregated profiles.

Source code in pycytominer/cyto_utils/cells.py
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
def aggregate_compartment(
    self,
    compartment,
    compute_subsample=False,
    compute_counts=False,
    add_image_features=False,
    n_aggregation_memory_strata=1,
):
    """Aggregate morphological profiles. Uses pycytominer.aggregate().

    Parameters
    ----------
    compartment : str
        Compartment to aggregate.
    compute_subsample : bool, default False
        Whether or not to subsample.
    compute_counts : bool, default False
        Whether or not to compute the number of objects in each compartment
        and the number of fields of view per well.
    add_image_features : bool, default False
        Whether or not to add image features.
    n_aggregation_memory_strata : int, default 1
        Number of unique strata to pull from the database into working memory
        at once.  Typically 1 is fastest.  A larger number uses more memory.
        For example, if aggregating by "well", then n_aggregation_memory_strata=1
        means that one "well" will be pulled from the SQLite database into
        memory at a time.

    Returns
    -------
    pandas.core.frame.DataFrame
        DataFrame of aggregated profiles.
    """
    check_compartments(compartment)

    # Subsample before aggregating when requested and configured
    if (self.subsample_frac < 1 or self.subsample_n != "all") and compute_subsample:
        self.get_subsample(compartment=compartment)

    # Load image data if not already loaded
    if not self.image_data_loaded:
        self.load_image(image_table_name=self.image_table_name)

    # Iteratively call aggregate() on chunks of the full compartment table
    # (each chunk contains complete strata, so per-chunk aggregation is valid)
    object_dfs = []
    for compartment_df in self._compartment_df_generator(
        compartment=compartment,
        n_aggregation_memory_strata=n_aggregation_memory_strata,
    ):
        # Join image metadata and normalize linking column names
        population_df = self.image_df.merge(
            compartment_df,
            how="inner",
            on=self.merge_cols,
        ).rename(self.linking_col_rename, axis="columns")

        if self.features == "infer":
            aggregate_features = infer_cp_features(
                population_df, compartments=compartment
            )
        else:
            aggregate_features = self.features

        partial_object_df = aggregate(
            population_df=population_df,
            strata=self.strata,
            compute_object_count=compute_counts,
            operation=self.aggregation_operation,
            subset_data_df=self.subset_data_df,
            features=aggregate_features,
            object_feature=self.object_feature,
        )

        # Count fields of view per stratum unless the field-of-view feature is
        # itself a stratification key (which would make the count trivial)
        if compute_counts and self.fields_of_view_feature not in self.strata:
            fields_count_df = aggregate_fields_count(
                self.image_df, self.strata, self.fields_of_view_feature
            )

            if add_image_features:
                fields_count_df = aggregate_image_features(
                    fields_count_df,
                    self.image_features_df,
                    self.image_feature_categories,
                    self.image_cols,
                    self.strata,
                    self.aggregation_operation,
                )

            partial_object_df = fields_count_df.merge(
                partial_object_df,
                on=self.strata,
                how="right",
            )

            # Separate all the metadata and feature columns.
            metadata_cols = infer_cp_features(partial_object_df, metadata=True)
            feature_cols = infer_cp_features(partial_object_df, image_features=True)

            # Reorder so metadata columns come before feature columns
            partial_object_df = partial_object_df.reindex(
                columns=metadata_cols + feature_cols
            )

        object_dfs.append(partial_object_df)

    # Concatenate one or more aggregated dataframes row-wise into final output
    object_df = pd.concat(object_dfs, axis=0).reset_index(drop=True)

    return object_df

aggregate_profiles(compute_subsample=False, output_file=None, compression_options=None, float_format=None, n_aggregation_memory_strata=1, **kwargs)

Aggregate and merge compartments. This is the primary entry to this class.

Parameters:

Name Type Description Default
compute_subsample bool

Whether or not to compute subsample. compute_subsample must be specified to perform subsampling. The function aggregate_profiles(compute_subsample=True) will apply subsetting even if subsample is initialized.

False
output_file str

The name of a file to output. We recommend that, if provided, the output file be suffixed with "_augmented".

None
compression_options str

Compression arguments as input to pandas.to_csv() with pandas version >= 1.2.

None
float_format str

Decimal precision to use in writing output file.

None
n_aggregation_memory_strata int

Number of unique strata to pull from the database into working memory at once. Typically 1 is fastest. A larger number uses more memory.

1

Returns:

Type Description
DataFrame or str

if output_file=None returns a Pandas dataframe; else will write to file and return the filepath of the file

Source code in pycytominer/cyto_utils/cells.py
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
def aggregate_profiles(
    self,
    compute_subsample=False,
    output_file=None,
    compression_options=None,
    float_format=None,
    n_aggregation_memory_strata=1,
    **kwargs,
):
    """Aggregate and merge compartments. This is the primary entry to this class.

    Parameters
    ----------
    compute_subsample : bool, default False
        Whether or not to compute subsample. compute_subsample must be specified to perform subsampling.
        The function aggregate_profiles(compute_subsample=True) will apply subsetting even if subsample is initialized.
    output_file : str, optional
        The name of a file to output. We recommend that, if provided, the output file be suffixed with "_augmented".
    compression_options : str, optional
        Compression arguments as input to pandas.to_csv() with pandas version >= 1.2.
    float_format : str, optional
        Decimal precision to use in writing output file.
    n_aggregation_memory_strata : int, default 1
        Number of unique strata to pull from the database into working memory
        at once.  Typically 1 is fastest.  A larger number uses more memory.

    Returns
    -------
    pandas.core.frame.DataFrame or str
        if output_file=None returns a Pandas dataframe
        else will write to file and return the filepath of the file
    """
    if output_file is not None:
        self.set_output_file(output_file)

    # Aggregate each compartment in turn. The first compartment also carries
    # object counts and (optionally) image features; every later compartment
    # is merged onto it by the shared strata columns.
    for compartment_idx, compartment in enumerate(self.compartments):
        if compartment_idx == 0:
            aggregated = self.aggregate_compartment(
                compartment=compartment,
                compute_subsample=compute_subsample,
                compute_counts=True,
                add_image_features=self.add_image_features,
                n_aggregation_memory_strata=n_aggregation_memory_strata,
            )
        else:
            aggregated = aggregated.merge(
                self.aggregate_compartment(
                    compartment=compartment,
                    n_aggregation_memory_strata=n_aggregation_memory_strata,
                ),
                on=self.strata,
                how="inner",
            )

    self.is_aggregated = True

    # self.output_file may have been set earlier (e.g. in __init__), so check
    # the attribute rather than the local output_file argument
    if self.output_file is not None:
        return output(
            df=aggregated,
            output_filename=self.output_file,
            compression_options=compression_options,
            float_format=float_format,
            **kwargs,
        )
    else:
        return aggregated

count_cells(compartment='cells', count_subset=False)

Determine how many cells are measured per well.

Parameters:

Name Type Description Default
compartment str

Compartment to subset.

"cells"
count_subset bool

Whether or not to count the number of cells as specified by the strata groups.

False

Returns:

Type Description
DataFrame

DataFrame of cell counts in the experiment.

Source code in pycytominer/cyto_utils/cells.py
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
def count_cells(self, compartment="cells", count_subset=False):
    """Determine how many cells are measured per well.

    Parameters
    ----------
    compartment : str, default "cells"
        Compartment to subset.
    count_subset : bool, default False
        Whether or not to count the number of cells as specified by the strata groups.

    Returns
    -------
    pandas.core.frame.DataFrame
        DataFrame of cell counts in the experiment.
    """
    check_compartments(compartment)

    if count_subset:
        # Counting a subset requires that aggregation and subsampling ran first
        assert self.is_aggregated, "Make sure to aggregate_profiles() first!"  # noqa: S101
        assert self.is_subset_computed, "Make sure to get_subsample() first!"  # noqa: S101
        per_stratum = self.subset_data_df.groupby(self.strata)[
            "Metadata_ObjectNumber"
        ]
        return (
            per_stratum.count()
            .reset_index()
            .rename({"Metadata_ObjectNumber": "cell_count"}, axis="columns")
        )

    # Otherwise count every object row that links to an image row
    query_cols = "TableNumber, ImageNumber, ObjectNumber"
    query = f"select {query_cols} from {compartment}"
    merged = self.image_df.merge(
        pd.read_sql(sql=query, con=self.conn), how="inner", on=self.merge_cols
    )
    per_stratum = merged.groupby(self.strata)["ObjectNumber"]
    return (
        per_stratum.count()
        .reset_index()
        .rename({"ObjectNumber": "cell_count"}, axis="columns")
    )

count_sql_table_rows(table)

Count total number of rows for a table.

Source code in pycytominer/cyto_utils/cells.py
414
415
416
417
def count_sql_table_rows(self, table):
    """Count total number of rows for a table."""
    # COUNT(*) yields exactly one single-column row; unpack it strictly
    result_row = next(self.conn.execute(f"SELECT COUNT(*) FROM {table}"))
    (total,) = result_row
    return total

get_sql_table_col_names(table)

Get column names from the database.

Source code in pycytominer/cyto_utils/cells.py
419
420
421
422
423
424
def get_sql_table_col_names(self, table):
    """Get column names from the database."""
    # The DBAPI cursor's description is a sequence with one entry per column;
    # the first element of each entry is the column name.
    cursor = self.conn.execute(f"SELECT * FROM {table} LIMIT 1").cursor
    return [entry[0] for entry in cursor.description]

get_subsample(df=None, compartment='cells', rename_col=True)

Apply the subsampling procedure.

Parameters:

Name Type Description Default
df DataFrame

DataFrame of a single cell profile.

None
compartment str

The compartment to process.

"cells"
rename_col bool

Whether or not to rename the columns.

True

Returns:

Type Description
None

Nothing is returned.

Source code in pycytominer/cyto_utils/cells.py
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
def get_subsample(self, df=None, compartment="cells", rename_col=True):
    """Apply the subsampling procedure.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        DataFrame of a single cell profile.
    compartment : str, default "cells"
        The compartment to process.
    rename_col : bool, default True
        Whether or not to rename the columns.

    Returns
    -------
    None
        Nothing is returned.
    """
    check_compartments(compartment)

    # When no dataframe is supplied, pull the linking columns straight from
    # the compartment table
    if df is None:
        query_cols = "TableNumber, ImageNumber, ObjectNumber"
        df = pd.read_sql(
            sql=f"select {query_cols} from {compartment}", con=self.conn
        )

    # Attach image-level metadata, then subsample within each stratum
    joined = self.image_df.merge(df, how="inner", on=self.merge_cols)
    self.subset_data_df = (
        joined.groupby(self.strata)
        .apply(lambda group: self.subsample_profiles(group, rename_col=rename_col))
        .reset_index(drop=True)
    )

    self.is_subset_computed = True

load_compartment(compartment)

Create the compartment dataframe.

Note: makes use of default_datatype_float attribute for setting a default floating point datatype.

Parameters:

Name Type Description Default
compartment str

The compartment to process.

required

Returns:

Type Description
DataFrame

Compartment dataframe.

Source code in pycytominer/cyto_utils/cells.py
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
def load_compartment(self, compartment):
    """Create the compartment dataframe.

    Note: makes use of default_datatype_float attribute
    for setting a default floating point datatype.

    Parameters
    ----------
    compartment : str
        The compartment to process.

    Returns
    -------
    pandas.core.frame.DataFrame
        Compartment dataframe.
    """
    # Get data useful to pre-alloc memory
    num_cells = self.count_sql_table_rows(compartment)
    col_names = self.get_sql_table_col_names(compartment)
    if self.features != "infer":  # allow to get only some features
        col_names = [x for x in col_names if x in self.features]
    meta_cols, feat_cols = self.split_column_categories(col_names)
    num_meta, num_feats = len(meta_cols), len(feat_cols)

    # Use pre-allocated np.array for feature data
    feats = np.empty(
        shape=(num_cells, num_feats), dtype=self.default_datatype_float
    )
    # Use pre-allocated pd.DataFrame for metadata
    metas = pd.DataFrame(columns=meta_cols, index=range(num_cells))

    # Query database for selected columns of chosen compartment
    columns = ", ".join(meta_cols + feat_cols)
    query = f"select {columns} from {compartment}"
    query_result = self.conn.execute(query)

    # Load data row by row for both meta information and features
    # NOTE(review): relies on the SELECT returning columns in
    # meta_cols + feat_cols order, and on every feature value being castable
    # to default_datatype_float; non-numeric feature values would raise here.
    for i, row in enumerate(query_result):
        metas.loc[i] = row[:num_meta]
        feats[i] = row[num_meta:]

    # Return concatenated data and metainformation of compartment
    return pd.concat([metas, pd.DataFrame(columns=feat_cols, data=feats)], axis=1)

load_image(image_table_name=None)

Load image table from sqlite file.

Returns:

Type Description
None

Nothing is returned.

Source code in pycytominer/cyto_utils/cells.py
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
def load_image(self, image_table_name=None):
    """Read the image table from the sqlite backend into ``self.image_df``.

    Parameters
    ----------
    image_table_name : str, optional
        Name of the image table to read; defaults to ``self.image_table_name``.

    Returns
    -------
    None
        Nothing is returned.
    """
    # Fall back to the table name configured on the instance
    if image_table_name is None:
        image_table_name = self.image_table_name

    # Pull the full image table from the database
    self.image_df = pd.read_sql(
        sql=f"select * from {image_table_name}", con=self.conn
    )

    # Optionally derive image-level features before the columns are trimmed
    if self.add_image_features:
        self.image_features_df = extract_image_features(
            self.image_feature_categories,
            self.image_df,
            self.image_cols,
            self.strata,
        )

    # Keep only the image and strata columns needed downstream
    keep_cols = list(np.union1d(self.image_cols, self.strata))
    self.image_df = self.image_df[keep_cols]

    # Restrict to the requested fields of view, if a subset was given
    if self.fields_of_view != "all":
        check_fields_of_view(
            list(np.unique(self.image_df[self.fields_of_view_feature])),
            list(self.fields_of_view),
        )
        fov_filter = f"{self.fields_of_view_feature}==@self.fields_of_view"
        self.image_df = self.image_df.query(fov_filter)

        if self.add_image_features:
            self.image_features_df = self.image_features_df.query(fov_filter)

    self.image_data_loaded = True

merge_single_cells(compute_subsample=False, sc_output_file=None, compression_options=None, float_format=None, single_cell_normalize=False, normalize_args=None, platemap=None, **kwargs)

Given the linking columns, merge single cell data. Normalization is also supported.

Parameters:

Name Type Description Default
compute_subsample bool

Whether or not to compute subsample.

False
sc_output_file str

The name of a file to output.

None
compression_options str

Compression arguments as input to pandas.to_csv() with pandas version >= 1.2.

None
float_format str

Decimal precision to use in writing output file.

None
single_cell_normalize bool

Whether or not to normalize the single cell data.

False
normalize_args dict

Additional arguments passed as input to pycytominer.normalize().

None
platemap Optional[Union[str, DataFrame]]

optional platemap filepath str or pd.DataFrame to be used with results via annotate

None

Returns:

Type Description
DataFrame or str

if output_file=None returns a Pandas dataframe else will write to file and return the filepath of the file

Source code in pycytominer/cyto_utils/cells.py
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
def merge_single_cells(
    self,
    compute_subsample: bool = False,
    sc_output_file: Optional[str] = None,
    compression_options: Optional[str] = None,
    float_format: Optional[str] = None,
    single_cell_normalize: bool = False,
    normalize_args: Optional[Dict] = None,
    platemap: Optional[Union[str, pd.DataFrame]] = None,
    **kwargs,
):
    """Given the linking columns, merge single cell data. Normalization is also supported.

    Parameters
    ----------
    compute_subsample : bool, default False
        Whether or not to compute subsample.
    sc_output_file : str, optional
        The name of a file to output.
    compression_options : str, optional
        Compression arguments as input to pandas.to_csv() with pandas version >= 1.2.
    float_format : str, optional
        Decimal precision to use in writing output file.
    single_cell_normalize : bool, default False
        Whether or not to normalize the single cell data.
    normalize_args : dict, optional
        Additional arguments passed as input to pycytominer.normalize().
    platemap: str or pd.DataFrame, default None
        optional platemap filepath str or pd.DataFrame to be used with results via annotate
    **kwargs
        Forwarded to pycytominer.annotate() (when platemap is given) and to
        the output() writer (when sc_output_file is given).

    Returns
    -------
    pandas.core.frame.DataFrame or str
        if output_file=None returns a Pandas dataframe
        else will write to file and return the filepath of the file
    """
    # Load the single cell dataframe by merging on the specific linking columns
    # NOTE: the empty string acts as a sentinel meaning "no compartment
    # loaded yet"; it is replaced by a DataFrame on the first loop pass.
    sc_df = ""
    linking_check_cols = []
    merge_suffix_rename = []
    for left_compartment in self.compartment_linking_cols:
        for right_compartment in self.compartment_linking_cols[left_compartment]:
            # Make sure only one merge per combination occurs
            linking_check = "-".join(sorted([left_compartment, right_compartment]))
            if linking_check in linking_check_cols:
                continue

            # Specify how to indicate merge suffixes
            merge_suffix = [
                f"_{left_compartment}",
                f"_{right_compartment}",
            ]
            merge_suffix_rename += merge_suffix
            # Columns linking left -> right and right -> left, respectively
            left_link_col = self.compartment_linking_cols[left_compartment][
                right_compartment
            ]
            right_link_col = self.compartment_linking_cols[right_compartment][
                left_compartment
            ]

            # First pass only: materialize the left compartment (and
            # optionally subsample it) before merging in the right one.
            if isinstance(sc_df, str):
                sc_df = self.load_compartment(compartment=left_compartment)

                if compute_subsample:
                    # Sample cells proportionally by self.strata
                    self.get_subsample(df=sc_df, rename_col=False)

                    # Drop image-level columns so the left-merge keys below
                    # are only the subsampled cell identifiers.
                    subset_logic_df = self.subset_data_df.drop(
                        self.image_df.columns, axis="columns"
                    )

                    sc_df = subset_logic_df.merge(
                        sc_df, how="left", on=subset_logic_df.columns.tolist()
                    ).reindex(sc_df.columns, axis="columns")

            sc_df = sc_df.merge(
                self.load_compartment(compartment=right_compartment),
                left_on=[*self.merge_cols, left_link_col],
                right_on=[*self.merge_cols, right_link_col],
                suffixes=merge_suffix,
            )

            linking_check_cols.append(linking_check)

    # Add metadata prefix to merged suffixes
    full_merge_suffix_rename = []
    full_merge_suffix_original = []
    for col_name in self.merge_cols + list(self.linking_col_rename.keys()):
        full_merge_suffix_original.append(col_name)
        full_merge_suffix_rename.append(f"Metadata_{col_name}")

    # Also cover the suffixed column variants produced by the merges above
    for col_name in self.merge_cols + list(self.linking_col_rename.keys()):
        for suffix in set(merge_suffix_rename):
            full_merge_suffix_original.append(f"{col_name}{suffix}")
            full_merge_suffix_rename.append(f"Metadata_{col_name}{suffix}")

    self.full_merge_suffix_rename = dict(
        zip(full_merge_suffix_original, full_merge_suffix_rename)
    )

    # Add image data to single cell dataframe
    if not self.image_data_loaded:
        self.load_image(image_table_name=self.image_table_name)

    sc_df = (
        self.image_df.merge(sc_df, on=self.merge_cols, how="right")
        # pandas rename performance may be improved using copy=False, inplace=False
        # reference: https://ryanlstevens.github.io/2022-05-06-pandasColumnRenaming/
        .rename(self.linking_col_rename, axis="columns", copy=False, inplace=False)
        .rename(
            self.full_merge_suffix_rename, axis="columns", copy=False, inplace=False
        )
    )
    if single_cell_normalize:
        # Infering features is tricky with non-canonical data
        if normalize_args is None:
            normalize_args = {}
            features = infer_cp_features(sc_df, compartments=self.compartments)
        elif ("features" not in normalize_args) or (
            normalize_args["features"] == "infer"
        ):
            features = infer_cp_features(sc_df, compartments=self.compartments)
        else:
            features = normalize_args["features"]

        normalize_args["features"] = features

        sc_df = normalize(profiles=sc_df, **normalize_args)

    # In case platemap metadata is provided, use pycytominer.annotate for metadata
    if platemap is not None:
        sc_df = annotate(
            profiles=sc_df, platemap=platemap, output_file=None, **kwargs
        )

    # if output argument is provided, call it using df_merged_sc and kwargs
    if sc_output_file is not None:
        return output(
            df=sc_df,
            output_filename=sc_output_file,
            compression_options=compression_options,
            float_format=float_format,
            **kwargs,
        )
    else:
        return sc_df

set_output_file(output_file)

Set or modify output file.

Parameters:

Name Type Description Default
output_file str

New output file name.

required

Returns:

Type Description
None

Nothing is returned.

Source code in pycytominer/cyto_utils/cells.py
195
196
197
198
199
200
201
202
203
204
205
206
207
208
def set_output_file(self, output_file):
    """Update the file path that profiles are written to.

    Parameters
    ----------
    output_file : str
        New output file name.

    Returns
    -------
    None
        Nothing is returned.
    """
    # Rebind the attribute; no validation is performed here
    self.output_file = output_file

set_subsample_frac(subsample_frac)

Set or update the subsample fraction.

Parameters:

Name Type Description Default
subsample_frac float

Percentage of single cells to select (0 < subsample_frac <= 1).

1

Returns:

Type Description
None

Nothing is returned.

Source code in pycytominer/cyto_utils/cells.py
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
def set_subsample_frac(self, subsample_frac):
    """Update the fraction of single cells that will be sampled.

    Parameters
    ----------
    subsample_frac : float, default 1
        Percentage of single cells to select (0 < subsample_frac <= 1).

    Returns
    -------
    None
        Nothing is returned.
    """
    self.subsample_frac = subsample_frac
    # Re-validate the combined subsampling settings after the change
    self._check_subsampling()

set_subsample_n(subsample_n)

Set or update the subsample n.

Parameters:

Name Type Description Default
subsample_n int

Indicate how many samples to subsample - do not specify both subsample_frac and subsample_n.

"all"

Returns:

Type Description
None

Nothing is returned.

Source code in pycytominer/cyto_utils/cells.py
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
def set_subsample_n(self, subsample_n):
    """Set or update the subsample n.

    Parameters
    ----------
    subsample_n : int, default "all"
        Indicate how many samples to subsample - do not specify both subsample_frac and subsample_n.

    Returns
    -------
    None
        Nothing is returned.

    Raises
    ------
    ValueError
        If ``subsample_n`` cannot be coerced to an integer.
    """
    try:
        self.subsample_n = int(subsample_n)
    except (ValueError, TypeError) as err:
        # Chain the original error for easier debugging; TypeError is also
        # caught so non-numeric inputs (e.g. None) raise the documented
        # ValueError instead of leaking an unrelated TypeError.
        raise ValueError("subsample n must be an integer or coercable") from err
    self._check_subsampling()

set_subsample_random_state(random_state)

Set or update the subsample random state.

Parameters:

Name Type Description Default
random_state

The random state to init subsample.

required

Returns:

Type Description
None

Nothing is returned.

Source code in pycytominer/cyto_utils/cells.py
245
246
247
248
249
250
251
252
253
254
255
256
257
258
def set_subsample_random_state(self, random_state):
    """Store the random seed used when drawing subsamples.

    Parameters
    ----------
    random_state: int, optional
        The random state to init subsample.

    Returns
    -------
    None
        Nothing is returned.
    """
    self.subsampling_random_state = random_state

split_column_categories(col_names)

Split a list of column names into feature and metadata columns lists.

Source code in pycytominer/cyto_utils/cells.py
426
427
428
429
430
431
432
433
434
435
436
def split_column_categories(self, col_names):
    """Partition column names into metadata and feature column lists."""
    # Feature columns are prefixed (case-insensitively) by a compartment
    # name; every other column is treated as metadata.
    compartment_prefixes = tuple(self.compartments)
    feature_columns = [
        col for col in col_names if col.lower().startswith(compartment_prefixes)
    ]
    metadata_columns = [
        col for col in col_names if not col.lower().startswith(compartment_prefixes)
    ]
    return metadata_columns, feature_columns

subsample_profiles(df, rename_col=True)

Sample a Pandas DataFrame given subsampling information.

Parameters:

Name Type Description Default
df DataFrame

DataFrame of a single cell profile.

required
rename_col bool

Whether or not to rename the columns.

True

Returns:

Type Description
DataFrame

A subsampled pandas dataframe of single cell profiles.

Source code in pycytominer/cyto_utils/cells.py
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
def subsample_profiles(self, df, rename_col=True):
    """Draw a subsample of single cell profiles from ``df``.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        DataFrame of a single cell profile.
    rename_col : bool, default True
        Whether or not to rename the columns.

    Returns
    -------
    pandas.core.frame.DataFrame
        A subsampled pandas dataframe of single cell profiles.
    """
    # Lazily choose a seed so that repeated draws stay reproducible
    if self.subsampling_random_state is None:
        self.set_subsample_random_state(np.random.randint(0, 10000, size=1)[0])

    # A fraction of exactly 1 signals count-based sampling via subsample_n
    if self.subsample_frac == 1:
        sampled = df.sample(
            n=self.subsample_n,
            replace=True,
            random_state=self.subsampling_random_state,
        )
    else:
        sampled = df.sample(
            frac=self.subsample_frac,
            random_state=self.subsampling_random_state,
        )

    if rename_col:
        sampled = sampled.rename(self.linking_col_rename, axis="columns")

    return sampled

pycytominer.cyto_utils.collate

Module that provides functions for collating CellProfiler-created CSVs into a single SQLite file.

collate(batch, config, plate, base_directory='../..', column=None, munge=False, csv_dir='analysis', aws_remote=None, aggregate_only=False, tmp_dir='/tmp', overwrite=False, add_image_features=True, image_feature_categories=['Granularity', 'Texture', 'ImageQuality', 'Threshold'], printtoscreen=True)

Collate the CellProfiler-created CSVs into a single SQLite file by calling cytominer-database.

Parameters:

Name Type Description Default
batch str

Batch name to process

required
config str

Config file to pass to cytominer-database

required
plate str

Plate name to process

required
base_directory str

Base directory for subdirectories containing CSVs, backends, etc; in our preferred structure, this is the "workspace" directory

"../.."
column str

An existing column to be explicitly copied to a new column called Metadata_Plate if no Metadata_Plate column already explicitly exists

None
munge bool

Whether munge should be passed to cytominer-database, if True cytominer-database will expect a single all-object CSV; it will split each object into its own table

False
csv_dir str

The directory under the base directory where the analysis CSVs will be found. If running the analysis pipeline, this should nearly always be "analysis"

'analysis'
aws_remote str

A remote AWS prefix, if set CSV files will be synced down from at the beginning and to which SQLite files will be synced up at the end of the run

None
aggregate_only bool

Whether to perform only the aggregation of existent SQLite files and bypass previous collation steps

False
tmp_dir

The temporary directory to be used by cytominer-databases for output

'/tmp'
overwrite

Whether or not to overwrite an sqlite that exists in the temporary directory if it already exists

False
add_image_features

Whether or not to add the image features to the profiles

True
image_feature_categories

The list of image feature groups to be used by add_image_features during aggregation

['Granularity', 'Texture', 'ImageQuality', 'Threshold']
printtoscreen

Whether or not to print output to the terminal

True
Source code in pycytominer/cyto_utils/collate.py
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
def collate(
    batch,
    config,
    plate,
    base_directory="../..",
    column=None,
    munge=False,
    csv_dir="analysis",
    aws_remote=None,
    aggregate_only=False,
    tmp_dir="/tmp",  # noqa: S108
    overwrite=False,
    add_image_features=True,
    image_feature_categories=["Granularity", "Texture", "ImageQuality", "Threshold"],
    printtoscreen=True,
):
    """Collate the CellProfiler-created CSVs into a single SQLite file by calling cytominer-database.

    Parameters
    ----------
    batch : str
        Batch name to process
    config : str
        Config file to pass to cytominer-database
    plate : str
        Plate name to process
    base_directory : str, default "../.."
        Base directory for subdirectories containing CSVs, backends, etc; in our preferred structure, this is the "workspace" directory
    column : str, optional, default None
        An existing column to be explicitly copied to a new column called Metadata_Plate if no Metadata_Plate column already explicitly exists
    munge : bool, default False
        Whether munge should be passed to cytominer-database, if True cytominer-database will expect a single all-object CSV; it will split each object into its own table
    csv_dir : str, default 'analysis'
        The directory under the base directory where the analysis CSVs will be found. If running the analysis pipeline, this should nearly always be "analysis"
    aws_remote : str, optional, default None
        A remote AWS prefix, if set CSV files will be synced down from at the beginning and to which SQLite files will be synced up at the end of the run
    aggregate_only : bool, default False
        Whether to perform only the aggregation of existent SQLite files and bypass previous collation steps
    tmp_dir: str, default '/tmp'
        The temporary directory to be used by cytominer-databases for output
    overwrite: bool, optional, default False
        Whether or not to overwrite an sqlite that exists in the temporary directory if it already exists
    add_image_features: bool, optional, default True
        Whether or not to add the image features to the profiles
    image_feature_categories: list, optional, default ['Granularity','Texture','ImageQuality','Threshold']
        The list of image feature groups to be used by add_image_features during aggregation
    printtoscreen: bool, optional, default True
        Whether or not to print output to the terminal
    """
    from pycytominer.cyto_utils.cells import SingleCells

    # Check if optional dependency cytominer-database is installed
    try:
        import cytominer_database.ingest
        import cytominer_database.munge
    except ImportError:
        raise ImportError(
            """Optional dependency cytominer-database is not installed.
            Please install the `collate` optional dependency group: e.g. `pip install pycytominer[collate]`
            """
        )

    # Set up directories (these need to be abspaths to keep from confusing makedirs later)
    input_dir = pathlib.Path(f"{base_directory}/analysis/{batch}/{plate}/{csv_dir}")
    backend_dir = pathlib.Path(f"{base_directory}/backend/{batch}/{plate}")
    cache_backend_dir = pathlib.Path(f"{tmp_dir}/backend/{batch}/{plate}")

    aggregated_file = pathlib.Path(f"{backend_dir}/{plate}.csv")
    backend_file = pathlib.Path(f"{backend_dir}/{plate}.sqlite")
    cache_backend_file = pathlib.Path(f"{cache_backend_dir}/{plate}.sqlite")

    if not aggregate_only:
        if os.path.exists(cache_backend_file):
            if not overwrite:
                sys.exit(
                    f"An SQLite file for {plate} already exists at {cache_backend_file} and overwrite is set to False. Terminating."
                )
            else:
                os.remove(cache_backend_file)

        for eachdir in [input_dir, backend_dir, cache_backend_dir]:
            if not os.path.exists(eachdir):
                os.makedirs(eachdir, exist_ok=True)

        if aws_remote:
            remote_input_dir = f"{aws_remote}/analysis/{batch}/{plate}/{csv_dir}"

            remote_backend_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.sqlite"

            remote_aggregated_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.csv"

            sync_cmd = f"aws s3 sync --exclude * --include */Cells.csv --include */Nuclei.csv --include */Cytoplasm.csv --include */Image.csv {remote_input_dir} {input_dir}"
            if printtoscreen:
                print(f"Downloading CSVs from {remote_input_dir} to {input_dir}")
            run_check_errors(sync_cmd)

        if printtoscreen:
            print(f"Ingesting {input_dir}")
        # Run cytominer-database ingest
        if munge:
            cytominer_database.munge.munge(config_path=config, source=input_dir)

        cytominer_database.ingest.seed(
            source=input_dir,
            target=f"sqlite:///{cache_backend_file}",
            config_file=config,
        )

        # Create a sqlite3 connection
        with sqlite3.connect(cache_backend_file, isolation_level=None) as connection:
            cursor = connection.cursor()
            if column:
                # BUGFIX: this previously tested `if print:` — the builtin
                # function, which is always truthy — so the message could
                # never be silenced via printtoscreen=False.
                if printtoscreen:
                    print(f"Adding a Metadata_Plate column based on column {column}")
                cursor.execute("ALTER TABLE Image ADD COLUMN Metadata_Plate TEXT;")
                cursor.execute(f"UPDATE image SET Metadata_Plate ={column};")

            if printtoscreen:
                print(f"Indexing database {cache_backend_file}")
            cursor.execute(
                "CREATE INDEX IF NOT EXISTS table_image_idx ON Image(TableNumber, ImageNumber);"
            )
            for eachcompartment in ["Cells", "Cytoplasm", "Nuclei"]:
                cursor.execute(
                    f"""CREATE INDEX IF NOT EXISTS table_image_object_{eachcompartment.lower()}_idx
                                ON {eachcompartment}(TableNumber, ImageNumber, ObjectNumber);"""
                )
            cursor.execute(
                "CREATE INDEX IF NOT EXISTS plate_well_image_idx ON Image(Metadata_Plate, Metadata_Well);"
            )
            cursor.close()
        connection.close()

        if aws_remote:
            if printtoscreen:
                print(f"Uploading {cache_backend_file} to {remote_backend_file}")
            cp_cmd = ["aws", "s3", "cp", cache_backend_file, remote_backend_file]
            run_check_errors(cp_cmd)

            if printtoscreen:
                print(
                    f"Removing analysis files from {input_dir} and {cache_backend_dir}"
                )
            import shutil

            shutil.rmtree(input_dir)

        if printtoscreen:
            print(f"Renaming {cache_backend_file} to {backend_file}")
        os.rename(cache_backend_file, backend_file)

    if printtoscreen:
        print(f"Aggregating sqlite:///{backend_file}")

    if aggregate_only and aws_remote:
        remote_backend_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.sqlite"

        remote_aggregated_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.csv"

        cp_cmd = ["aws", "s3", "cp", remote_backend_file, backend_file]
        if printtoscreen:
            print(
                f"Downloading SQLite files from {remote_backend_file} to {backend_file}"
            )
        run_check_errors(cp_cmd)

    if not os.path.exists(backend_file):
        sys.exit(f"{backend_file} does not exist. Exiting.")

    if add_image_features:
        pass
    else:
        image_feature_categories = None  # defensive but not sure what will happen if we give a list but set to False

    database = SingleCells(
        f"sqlite:///{backend_file}",
        aggregation_operation="mean",
        add_image_features=add_image_features,
        image_feature_categories=image_feature_categories,
    )
    database.aggregate_profiles(output_file=aggregated_file)

    if aws_remote:
        if printtoscreen:
            print(f"Uploading {aggregated_file} to {remote_aggregated_file}")
        csv_cp_cmd = ["aws", "s3", "cp", aggregated_file, remote_aggregated_file]
        run_check_errors(csv_cp_cmd)

        if printtoscreen:
            print(f"Removing backend files from {backend_dir}")
        import shutil

        shutil.rmtree(backend_dir)

run_check_errors(cmd)

Run a system command, and exit if an error occurred, otherwise continue.

Source code in pycytominer/cyto_utils/collate.py
10
11
12
13
14
15
16
17
18
19
20
def run_check_errors(cmd):
    """Run a system command, and exit if an error occurred, otherwise continue."""
    # Accept either a command string or an argv-style list
    argv = cmd.split() if isinstance(cmd, str) else cmd
    result = subprocess.run(argv, capture_output=True, text=True)  # noqa: S603
    # Any stderr output is treated as a fatal error
    if result.stderr != "":
        sys.exit(
            f"The error {result.stderr} was generated when running "
            f"{' '.join(map(str, argv))}. Exiting."
        )
    return

pycytominer.cyto_utils.collate_cmd

Command line interface for collate function in pycytominer.cyto_utils.collate.

pycytominer.cyto_utils.cp_image_features

Functions for counting the number of fields and aggregating other images features.

aggregate_fields_count(image_df, strata, fields_of_view_feature)

Compute the number of fields per well and create a new column called Metadata_Site_Count.

Parameters:

Name Type Description Default
image_df DataFrame

Image table dataframe which includes the strata and fields of view feature as columns.

required
strata list of str

The columns to groupby and aggregate single cells.

required
fields_of_view_feature

Name of the fields of view column.

required

Returns:

Name Type Description
fields_count_df DataFrame

DataFrame with the Metadata_Site_Count column.

Source code in pycytominer/cyto_utils/cp_image_features.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
def aggregate_fields_count(image_df, strata, fields_of_view_feature):
    """Count the fields of view per well, stored as Metadata_Site_Count.

    Parameters
    ----------
    image_df : pandas.core.frame.DataFrame
        Image table dataframe which includes the strata and fields of view feature as columns.
    strata :  list of str
        The columns to groupby and aggregate single cells.
    fields_of_view_feature: str
        Name of the fields of view column.

    Returns
    -------
    fields_count_df: pandas.core.frame.DataFrame
        DataFrame with the Metadata_Site_Count column.

    """
    # Only the grouping columns and the field-of-view column are needed
    wanted_cols = list(np.union1d(strata, fields_of_view_feature))
    per_group = (
        image_df.loc[:, wanted_cols].groupby(strata)[fields_of_view_feature].count()
    )

    # One row per well, with the count under a standardized column name
    return per_group.reset_index().rename(
        columns={fields_of_view_feature: "Metadata_Site_Count"}
    )

aggregate_image_count_features(df, image_features_df, image_cols, strata, count_prefix='Count')

Aggregate the Count features in the Image table.

Parameters:

Name Type Description Default
df DataFrame

Dataframe of aggregated profiles.

required
image_features_df DataFrame

Image table dataframe with Count features

required
image_cols list of str

Columns to select from the image table.

required
strata list of str

The columns to groupby and aggregate single cells.

required
count_prefix str

Prefix of the count columns in the image table.

"Count"

Returns:

Name Type Description
df DataFrame

DataFrame with aggregated Count features in the Image table.

remove_cols list of str

Columns to remove from the image table before aggregating using aggregate_image_features()

Source code in pycytominer/cyto_utils/cp_image_features.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def aggregate_image_count_features(
    df, image_features_df, image_cols, strata, count_prefix="Count"
):
    """Aggregate the Count features in the Image table.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        Dataframe of aggregated profiles.
    image_features_df : pandas.core.frame.DataFrame
        Image table dataframe with Count features
    image_cols : list of str
        Columns to select from the image table.
    strata :  list of str
        The columns to groupby and aggregate single cells.
    count_prefix : str, default "Count"
        Prefix of the count columns in the image table.

    Returns
    -------
    df : pandas.core.frame.DataFrame
        DataFrame with aggregated Count features in the Image table.
    remove_cols : list of str
        Columns to remove from the image table before aggregating using aggregate_image_features()
    """
    # Count columns carry a "Metadata_<prefix>" name in the image table
    count_mask = image_features_df.columns.str.startswith(
        "Metadata_" + str(count_prefix)
    )
    count_features = list(image_features_df.columns[count_mask])

    # These columns are consumed here and must be dropped downstream
    remove_cols = list(np.union1d(image_cols, count_features))

    # Sum the counts within each well and merge them onto the profiles
    keep_cols = list(np.union1d(strata, count_features))
    summed_counts = (
        image_features_df[keep_cols]
        .copy()
        .groupby(strata, dropna=False)
        .sum()
        .reset_index()
    )
    df = df.merge(summed_counts, on=strata, how="left")

    return df, remove_cols

aggregate_image_features(df, image_features_df, image_feature_categories, image_cols, strata, aggregation_operation, count_prefix='Count')

Aggregate the non-Count image features.

Parameters:

Name Type Description Default
df DataFrame

Dataframe of aggregated profiles.

required
image_features_df DataFrame

Image table dataframe with all the image_feature_category features.

required
image_feature_categories list of str

List of categories of features from the image table to add to the profiles.

required
image_cols list of str

Columns to select from the image table.

required
strata list of str

The columns to groupby and aggregate single cells.

required
aggregation_operation str

Operation to perform image table feature aggregation.

required
count_prefix str

Prefix of the count columns in the image table.

"Count"

Returns:

Name Type Description
df DataFrame

DataFrame of aggregated image features.

Source code in pycytominer/cyto_utils/cp_image_features.py
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
def aggregate_image_features(
    df,
    image_features_df,
    image_feature_categories,
    image_cols,
    strata,
    aggregation_operation,
    count_prefix="Count",
):
    """Aggregate the non-Count image features.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        Dataframe of aggregated profiles.
    image_features_df : pandas.core.frame.DataFrame
        Image table dataframe with all the image_feature_category features.
    image_feature_categories : list of str
        List of categories of features from the image table to add to the profiles.
    image_cols : list of str
        Columns to select from the image table.
    strata : list of str
        The columns to groupby and aggregate single cells.
    aggregation_operation : str
        Operation to perform image table feature aggregation.
    count_prefix : str, default "Count"
        Prefix of the count columns in the image table.

    Returns
    -------
    df : pandas.core.frame.DataFrame
        DataFrame of aggregated image features.
    """
    # Aggregate image count features
    if count_prefix in image_feature_categories:
        # Bug fix: forward count_prefix to the helper so a non-default prefix
        # is honored (previously the helper silently used its own default).
        df, remove_cols = aggregate_image_count_features(
            df, image_features_df, image_cols, strata, count_prefix
        )
    else:
        # Count aggregation not requested: still flag count columns for removal
        remove_cols = list(image_cols) + list(
            image_features_df.columns[
                image_features_df.columns.str.startswith(f"Metadata_{count_prefix}")
            ]
        )

    # Aggregate the remaining (non-count) image feature categories
    if len(np.setdiff1d(image_feature_categories, [count_prefix])) != 0:
        image_features_df = image_features_df.drop(
            remove_cols, axis="columns", errors="ignore"
        )
        features = list(np.setdiff1d(list(image_features_df.columns), strata))
        image_features_df = aggregate.aggregate(
            population_df=image_features_df,
            strata=strata,
            features=features,
            operation=aggregation_operation,
        )

        df = df.merge(image_features_df, on=strata, how="left")

    return df

pycytominer.cyto_utils.features

Utility function to manipulate cell profiler features.

convert_compartment_format_to_list(compartments)

Convert cell painting compartments to a list.

Parameters:

Name Type Description Default
compartments list of str or str

Cell Painting compartment(s).

required

Returns:

Name Type Description
compartments list of str

List of Cell Painting compartments.

Source code in pycytominer/cyto_utils/features.py
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
def convert_compartment_format_to_list(compartments):
    """Convert cell painting compartments to a list.

    Parameters
    ----------
    compartments : list of str or str
        Cell Painting compartment(s).

    Returns
    -------
    compartments : list of str
        List of Cell Painting compartments.
    """
    # A bare string becomes a single-element list; lists are lowercased
    # element-wise; any other type is passed through untouched.
    if isinstance(compartments, str):
        return [compartments.lower()]
    if isinstance(compartments, list):
        return [entry.lower() for entry in compartments]
    return compartments

count_na_features(population_df, features)

Given a population dataframe and features, count how many nas per feature.

Parameters:

Name Type Description Default
population_df DataFrame

DataFrame of profiles.

required
features list of str

Features present in the population dataframe.

required

Returns:

Type Description
Dataframe of NA counts per feature
Source code in pycytominer/cyto_utils/features.py
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
def count_na_features(population_df, features):
    """Given a population dataframe and features, count how many nas per feature.

    Parameters
    ----------
    population_df : pandas.core.frame.DataFrame
        DataFrame of profiles.
    features : list of str
        Features present in the population dataframe.

    Returns
    -------
    Dataframe of NA counts per feature
    """
    na_counts = population_df.loc[:, features].isna().sum()
    return pd.DataFrame({"num_na": na_counts})

drop_outlier_features(population_df, features='infer', samples='all', outlier_cutoff=500)

Exclude a feature if its min or max absolute value is greater than the threshold.

Parameters:

Name Type Description Default
population_df DataFrame

DataFrame that includes metadata and observation features.

required
features list of str or str

Features present in the population dataframe. If "infer", then assume Cell Painting features are those that start with "Cells_", "Nuclei_", or "Cytoplasm_"

"infer"
samples str

List of samples to perform operation on. The function uses a pd.DataFrame.query() function, so you should structure samples in this fashion. An example is "Metadata_treatment == 'control'" (include all quotes). If "all", use all samples to calculate.

"all"
outlier_cutoff int or float

see https://github.com/cytomining/pycytominer/issues/237 for details. Threshold to remove features if absolute values is greater

500

Returns:

Name Type Description
outlier_features list of str

Features greater than the threshold.

Source code in pycytominer/cyto_utils/features.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
def drop_outlier_features(
    population_df, features="infer", samples="all", outlier_cutoff=500
):
    """Exclude a feature if its min or max absolute value is greater than the threshold.

    Parameters
    ----------
    population_df : pandas.core.frame.DataFrame
        DataFrame that includes metadata and observation features.
    features : list of str or str, default "infer"
        Features present in the population dataframe. If "infer", then assume
        Cell Painting features are those that start with "Cells_", "Nuclei_",
        or "Cytoplasm_".
    samples : str, default "all"
        List of samples to perform operation on. The function uses a
        pd.DataFrame.query() function, so you should structure samples in this
        fashion. An example is "Metadata_treatment == 'control'" (include all
        quotes). If "all", use all samples to calculate.
    outlier_cutoff : int or float, default 500
        see https://github.com/cytomining/pycytominer/issues/237 for details.
        Threshold to remove features if absolute values is greater

    Returns
    -------
    outlier_features : list of str
        Features greater than the threshold.
    """
    # Bug fix: subset without mutating the caller's dataframe (the previous
    # implementation used query(..., inplace=True), silently altering input).
    if samples != "all":
        population_df = population_df.query(samples)

    if features == "infer":
        features = infer_cp_features(population_df)

    # Restrict to the requested feature columns (previously duplicated in
    # both branches of an if/else)
    population_df = population_df.loc[:, features]

    max_feature_values = population_df.max().abs()
    min_feature_values = population_df.min().abs()

    # A feature is an outlier if either extreme exceeds the cutoff in magnitude
    outlier_features = max_feature_values[
        (max_feature_values > outlier_cutoff) | (min_feature_values > outlier_cutoff)
    ].index.tolist()

    return outlier_features

get_blocklist_features(blocklist_file=blocklist_file, population_df=None)

Get a list of blocklist features.

Parameters:

Name Type Description Default
blocklist_file path-like object

Location of the dataframe with features to exclude.

blocklist_file
population_df DataFrame

Profile dataframe used to subset blocklist features.

None

Returns:

Name Type Description
blocklist_features list of str

Features to exclude from downstream analysis.

Source code in pycytominer/cyto_utils/features.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
def get_blocklist_features(blocklist_file=blocklist_file, population_df=None):
    """Get a list of blocklist features.

    Parameters
    ----------
    blocklist_file : path-like object
        Location of the dataframe with features to exclude.
    population_df : pandas.core.frame.DataFrame, optional
        Profile dataframe used to subset blocklist features.

    Returns
    -------
    blocklist_features : list of str
        Features to exclude from downstream analysis.
    """
    blocklist = pd.read_csv(blocklist_file)

    # The file must carry a column literally named "blocklist"
    assert (  # noqa: S101
        "blocklist" in blocklist.columns
    ), "one column must be named 'blocklist'"

    blocklist_features = blocklist["blocklist"].to_list()

    # Optionally keep only blocklist entries that appear in the profile columns
    if isinstance(population_df, pd.DataFrame):
        available = set(population_df.columns.tolist())
        blocklist_features = [
            feature for feature in blocklist_features if feature in available
        ]

    return blocklist_features

infer_cp_features(population_df, compartments=['Cells', 'Nuclei', 'Cytoplasm'], metadata=False, image_features=False)

Given a dataframe, output features that we expect to be Cell Painting features.

Parameters:

Name Type Description Default
population_df DataFrame

DataFrame from which features are to be inferred.

required
compartments list of str

Compartments from which Cell Painting features were extracted.

["Cells", "Nuclei", "Cytoplasm"]
metadata bool

Whether or not to infer metadata features.

False
image_features bool

Whether or not the profiles contain image features.

False

Returns:

Name Type Description
features list of str

List of Cell Painting features.

Source code in pycytominer/cyto_utils/features.py
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
def infer_cp_features(
    population_df,
    compartments=["Cells", "Nuclei", "Cytoplasm"],
    metadata=False,
    image_features=False,
):
    """Given a dataframe, output features that we expect to be Cell Painting features.

    Parameters
    ----------
    population_df : pandas.core.frame.DataFrame
        DataFrame from which features are to be inferred.
    compartments : list of str, default ["Cells", "Nuclei", "Cytoplasm"]
        Compartments from which Cell Painting features were extracted.
    metadata : bool, default False
        Whether or not to infer metadata features.
    image_features : bool, default False
        Whether or not the profiles contain image features.

    Returns
    -------
    features: list of str
        List of Cell Painting features.
    """
    # Normalize the compartment spec to Title-cased prefixes
    # (str.title is case-insensitive, so lowering first is unnecessary)
    if isinstance(compartments, str):
        compartments = [compartments]
    prefixes = [entry.title() for entry in compartments]

    if image_features:
        # De-duplicate while force-including the Image compartment
        prefixes = list({"Image", *prefixes})

    features = [
        column
        for column in population_df.columns.tolist()
        if any(column.startswith(prefix) for prefix in prefixes)
    ]

    # Metadata inference overrides compartment-based selection entirely
    if metadata:
        features = population_df.columns[
            population_df.columns.str.startswith("Metadata_")
        ].tolist()

    assert (  # noqa: S101
        len(features) > 0
    ), "No CP features found. Are you sure this dataframe is from CellProfiler?"

    return features

label_compartment(cp_features, compartment, metadata_cols)

Assign compartment label to each features as a prefix.

Parameters:

Name Type Description Default
cp_features list of str

All features being used.

required
compartment str

Measured compartment.

required
metadata_cols list

Columns that should be considered metadata.

required

Returns:

Name Type Description
cp_features list of str

Recoded column names with appropriate metadata and compartment labels.

Source code in pycytominer/cyto_utils/features.py
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
def label_compartment(cp_features, compartment, metadata_cols):
    """Assign compartment label to each features as a prefix.

    Parameters
    ----------
    cp_features : list of str
        All features being used.
    compartment : str
        Measured compartment.
    metadata_cols : list
        Columns that should be considered metadata.

    Returns
    -------
    cp_features: list of str
        Recoded column names with appropriate metadata and compartment labels.
    """
    # Bug fix: str has no .Title() method — use .title()
    compartment = compartment.title()
    # Bug fix: "Nuceli" typo made "nuclei" always fail validation
    avail_compartments = ["Cells", "Cytoplasm", "Nuclei", "Image", "Barcode"]

    assert (  # noqa: S101
        compartment in avail_compartments
    ), f"provide valid compartment. One of: {avail_compartments}"

    # Metadata columns get a Metadata_ prefix; everything else gets the
    # compartment name as its prefix
    cp_features = [
        f"Metadata_{x}" if x in metadata_cols else f"{compartment}_{x}"
        for x in cp_features
    ]

    return cp_features

pycytominer.cyto_utils.load

Module for loading data from various file formats.

infer_delim(file)

Sniff the delimiter in the given file.

Parameters:

Name Type Description Default
file str

File name

required
Return

the delimiter used in the dataframe (typically either tab or commas)

Source code in pycytominer/cyto_utils/load.py
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
def infer_delim(file: str):
    """
    Sniff the delimiter in the given file.

    Parameters
    ----------
    file : str
        File name

    Return
    ------
    the delimiter used in the dataframe (typically either tab or commas)
    """
    # Read the first line; if the file is not plain text, retry as gzip
    try:
        with open(file) as handle:
            first_line = handle.readline()
    except UnicodeDecodeError:
        with gzip.open(file, "r") as handle:
            first_line = handle.readline().decode()

    return csv.Sniffer().sniff(first_line).delimiter

is_path_a_parquet_file(file)

Check if the provided file path is a parquet file.

Identify parquet files by inspecting the file extensions. If the file does not end with parquet, this will return False, else True.

Parameters:

Name Type Description Default
file Union[str, PurePath]

path to parquet file

required

Returns:

Type Description
bool

Returns True if the file path contains .parquet, else it will return False

Raises:

Type Description
TypeError

Raised if a non str or non-path object is passed in the file parameter

FileNotFoundError

Raised if the provided path in the file does not exist

Source code in pycytominer/cyto_utils/load.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
def is_path_a_parquet_file(file: Union[str, pathlib.PurePath]) -> bool:
    """Check if the provided file path is a parquet file.

    Identify parquet files by inspecting the file extensions.
    If the file does not end with `parquet`, this will return False, else True.

    Parameters
    ----------
    file : Union[str, pathlib.PurePath]
        path to parquet file

    Returns
    -------
    bool
        Returns True if the file path contains `.parquet`, else it will return
        False

    Raises
    ------
    TypeError
        Raised if a non str or non-path object is passed in the `file` parameter
    FileNotFoundError
        Raised if the provided path in the `file` does not exist
    """
    # PurePath() raises TypeError for non-path-like input
    candidate = pathlib.PurePath(file)

    # NOTE(review): despite the docstring, a missing path is only reported via
    # print — the extension check still runs on the unresolved path. Preserved
    # as-is; confirm whether raising was intended.
    try:
        candidate = pathlib.Path(candidate).resolve(strict=True)
    except FileNotFoundError as e:
        print("load_profiles() didn't find the path.", e, sep="\n")

    return candidate.suffix.lower() == ".parquet"

load_npz_features(npz_file, fallback_feature_prefix='DP', metadata=True)

Load an npz file storing features and, sometimes, metadata.

The function will first search the .npz file for a metadata column called "Metadata_Model". If the field exists, the function uses this entry as the feature prefix. If it doesn't exist, use the fallback_feature_prefix.

If the npz file does not exist, this function returns an empty dataframe.

Parameters:

Name Type Description Default
npz_file str

file path to the compressed output (typically DeepProfiler output)

required
fallback_feature_prefix

a string to prefix all features [default: "DP"].

'DP'
Return

df : pandas.core.frame.DataFrame pandas DataFrame of profiles

Source code in pycytominer/cyto_utils/load.py
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
def load_npz_features(npz_file, fallback_feature_prefix="DP", metadata=True):
    """
    Load an npz file storing features and, sometimes, metadata.

    The function will first search the .npz file for a metadata column called
    "Metadata_Model". If the field exists, the function uses this entry as the
    feature prefix. If it doesn't exist, use the fallback_feature_prefix.

    If the npz file does not exist, this function returns an empty dataframe.

    Parameters
    ----------
    npz_file : str
        file path to the compressed output (typically DeepProfiler output)
    fallback_feature_prefix : str
        a string to prefix all features [default: "DP"].
    metadata : bool
        whether to attach any stored metadata to the returned dataframe

    Return
    ------
    df : pandas.core.frame.DataFrame
        pandas DataFrame of profiles
    """
    try:
        npz = np.load(npz_file, allow_pickle=True)
    except FileNotFoundError:
        return pd.DataFrame([])

    has_metadata = "metadata" in npz.files
    df = pd.DataFrame(npz["features"])

    if not metadata:
        return df

    metadata_df = None
    feature_prefix = fallback_feature_prefix
    if has_metadata:
        stored_metadata = npz["metadata"].item()
        metadata_df = pd.DataFrame(
            stored_metadata, index=range(0, df.shape[0]), dtype=str
        )
        metadata_df.columns = [
            x if x.startswith("Metadata_") else f"Metadata_{x}" for x in metadata_df
        ]

        # Prefer the model name recorded in the metadata as the feature prefix
        if "Metadata_Model" in metadata_df.columns:
            feature_prefix = metadata_df.Metadata_Model.unique()[0]

    # Prefix every raw feature column that is not already prefixed
    df.columns = [
        x if str(x).startswith(feature_prefix) else f"{feature_prefix}_{x}" for x in df
    ]

    # Attach metadata columns alongside the features
    if metadata_df is not None:
        df = metadata_df.merge(df, how="outer", left_index=True, right_index=True)

    return df

load_npz_locations(npz_file, location_x_col_index=0, location_y_col_index=1)

Load an npz file storing locations and, sometimes, metadata.

The function reads the "locations" array stored in the .npz file and returns the requested x and y coordinate columns.

If the npz file does not exist, this function returns an empty dataframe.

Parameters:

Name Type Description Default
npz_file str

file path to the compressed output (typically DeepProfiler output)

required
location_x_col_index

index of the x location column (which column in DP output has X coords)

0
location_y_col_index

index of the y location column (which column in DP output has Y coords)

1
Return

df : pandas.core.frame.DataFrame pandas DataFrame of profiles

Source code in pycytominer/cyto_utils/load.py
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
def load_npz_locations(npz_file, location_x_col_index=0, location_y_col_index=1):
    """
    Load an npz file storing single-cell locations.

    Reads the "locations" array from the .npz file and returns the requested
    x and y coordinate columns.

    If the npz file does not exist, this function returns an empty dataframe.

    Parameters
    ----------
    npz_file : str
        file path to the compressed output (typically DeepProfiler output)
    location_x_col_index : int
        index of the x location column (which column in DP output has X coords)
    location_y_col_index : int
        index of the y location column (which column in DP output has Y coords)

    Return
    ------
    df : pandas.core.frame.DataFrame
        pandas DataFrame of locations
    """
    try:
        npz = np.load(npz_file, allow_pickle=True)
    except FileNotFoundError:
        return pd.DataFrame([])

    locations = npz["locations"]
    available_cols = locations.shape[1]

    # Reject indices beyond the columns actually present in the array
    if location_x_col_index >= available_cols:
        raise IndexError("OutOfBounds indexing via location_x_col_index")
    if location_y_col_index >= available_cols:
        raise IndexError("OutOfBounds indexing via location_y_col_index")

    coords = pd.DataFrame(locations)[[location_x_col_index, location_y_col_index]]
    coords.columns = ["Location_Center_X", "Location_Center_Y"]
    return coords

load_platemap(platemap, add_metadata_id=True)

Unless a dataframe is provided, load the given platemap dataframe from path or string.

Parameters:

Name Type Description Default
platemap pandas dataframe

location or actual pandas dataframe of platemap file

required
add_metadata_id bool

boolean if "Metadata_" should be appended to all platemap columns

True
Return

platemap : pandas.core.frame.DataFrame pandas DataFrame of profiles

Source code in pycytominer/cyto_utils/load.py
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
def load_platemap(platemap, add_metadata_id=True):
    """
    Unless a dataframe is provided, load the given platemap dataframe from path or string.

    Parameters
    ----------
    platemap : pandas dataframe
        location or actual pandas dataframe of platemap file

    add_metadata_id : bool
        boolean if "Metadata_" should be appended to all platemap columns

    Return
    ------
    platemap : pandas.core.frame.DataFrame
        pandas DataFrame of profiles

    Raises
    ------
    FileNotFoundError
        Raised if the platemap path does not exist.
    """
    if not isinstance(platemap, pd.DataFrame):
        try:
            delim = infer_delim(platemap)
            platemap = pd.read_csv(platemap, sep=delim)
        except FileNotFoundError as err:
            # Chain the original error so the underlying cause is preserved
            # (previously raised without `from`, hiding the original traceback)
            raise FileNotFoundError(f"{platemap} platemap file not found") from err
    else:
        # Setting platemap to a copy to prevent column name changes from back-propagating
        platemap = platemap.copy()

    if add_metadata_id:
        platemap.columns = [
            f"Metadata_{x}" if not x.startswith("Metadata_") else x
            for x in platemap.columns
        ]
    return platemap

load_profiles(profiles)

Unless a dataframe is provided, load the given profile dataframe from path or string.

Parameters:

Name Type Description Default
profiles (str, Path, DataFrame)

file location or actual pandas dataframe of profiles

str
Return

pandas DataFrame of profiles

Raises:

Type Description
FileNotFoundError

Raised if the provided profile does not exists

Source code in pycytominer/cyto_utils/load.py
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
def load_profiles(profiles):
    """
    Unless a dataframe is provided, load the given profile dataframe from path or string.

    Parameters
    ----------
    profiles : {str, pathlib.Path, pandas.DataFrame}
        file location or actual pandas dataframe of profiles

    Return
    ------
    pandas DataFrame of profiles

    Raises
    ------
    FileNotFoundError
        Raised if the provided profile does not exists
    """
    # Guard clause: an in-memory dataframe is returned untouched
    if isinstance(profiles, pd.DataFrame):
        return profiles

    # Parquet files get a dedicated reader; everything else is delimited text
    if is_path_a_parquet_file(profiles):
        return pd.read_parquet(profiles, engine="pyarrow")

    return pd.read_csv(profiles, sep=infer_delim(profiles))

pycytominer.cyto_utils.modz

Module for performing a modified z score transformation.

modz(population_df, replicate_columns, features='infer', method='spearman', min_weight=0.01, precision=4)

Collapse replicates into a consensus signature using a weighted transformation.

Parameters:

Name Type Description Default
population_df DataFrame

DataFrame that includes metadata and observation features.

required
replicate_columns (str, list)

a string or list of column(s) in the population dataframe that indicate replicate level information

required
features list

List of features present in the population dataframe [default: "infer"] if "infer", then assume cell painting features are those that start with "Cells_", "Nuclei_", or "Cytoplasm_".

"infer"
method str

indicating which correlation metric to use.

"spearman"
min_weight float

the minimum correlation to clip all non-negative values lower to

0.01
precision int

how many significant digits to round weights to

4

Returns:

Name Type Description
modz_df DataFrame

Consensus signatures with metadata for all replicates in the given DataFrame

Source code in pycytominer/cyto_utils/modz.py
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
def modz(
    population_df,
    replicate_columns,
    features="infer",
    method="spearman",
    min_weight=0.01,
    precision=4,
):
    """Collapse replicates into a consensus signature using a weighted transformation.

    Parameters
    ----------
    population_df : pandas.core.frame.DataFrame
        DataFrame that includes metadata and observation features.
    replicate_columns : str, list
        a string or list of column(s) in the population dataframe that
        indicate replicate level information
    features : list, default "infer"
         List of features present in the population dataframe [default: "infer"]
         if "infer", then assume cell painting features are those that start with
         "Cells_", "Nuclei_", or "Cytoplasm_".
    method : str, default "spearman"
        indicating which correlation metric to use.
    min_weight : float, default 0.01
        the minimum correlation to clip all non-negative values lower to
    precision : int, default 4
        how many significant digits to round weights to

    Returns
    -------
    modz_df : pandas.core.frame.DataFrame
        Consensus signatures with metadata for all replicates in the given DataFrame

    Raises
    ------
    ValueError
        If replicate_columns is neither a list nor a string.
    """
    population_features = population_df.columns.tolist()
    assert_error = f"{replicate_columns} not in input dataframe"
    if isinstance(replicate_columns, list):
        assert all(x in population_features for x in replicate_columns), assert_error  # noqa: S101
    elif isinstance(replicate_columns, str):
        assert replicate_columns in population_features, assert_error  # noqa: S101
        replicate_columns = replicate_columns.split()
    else:
        # Bug fix: previously this *returned* the ValueError instance instead
        # of raising it, so invalid input never signaled an error.
        raise ValueError("replicate_columns must be a list or string")

    if features == "infer":
        features = infer_cp_features(population_df)

    # Keep only replicate metadata plus the features being collapsed
    subset_features = list(set(replicate_columns + features))
    population_df = population_df.loc[:, subset_features]

    # Apply the modified z transformation within each replicate group
    modz_df = (
        population_df.groupby(replicate_columns)
        .apply(
            lambda x: modz_base(
                x.loc[:, features],
                method=method,
                min_weight=min_weight,
                precision=precision,
            )
        )
        .reset_index()
    )

    return modz_df

modz_base(population_df, method='spearman', min_weight=0.01, precision=4)

Perform a modified z score transformation.

This code is modified from cmapPy. (see https://github.com/cytomining/pycytominer/issues/52). Note that this will apply the transformation to the FULL population_df. See modz() for replicate level procedures.

Parameters:

Name Type Description Default
population_df DataFrame

DataFrame that includes metadata and observation features.

required
method str

indicating which correlation metric to use.

"spearman"
min_weight float

the minimum correlation to clip all non-negative values lower to

0.01
precision int

how many significant digits to round weights to

4

Returns:

Name Type Description
modz_df DataFrame

modz transformed dataframe - a consensus signature of the input data weighted by replicate correlation

Source code in pycytominer/cyto_utils/modz.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def modz_base(population_df, method="spearman", min_weight=0.01, precision=4):
    """Perform a modified z score transformation.

    This code is modified from cmapPy.
    (see https://github.com/cytomining/pycytominer/issues/52). Note that this will
    apply the transformation to the FULL population_df.
    See modz() for replicate level procedures.

    Parameters
    ----------
    population_df : pandas.core.frame.DataFrame
        DataFrame that includes metadata and observation features.
    method : str, default "spearman"
        indicating which correlation metric to use.
    min_weight : float, default 0.01
        the minimum correlation to clip all non-negative values lower to
    precision : int, default 4
        how many significant digits to round weights to

    Returns
    -------
    modz_df : pandas.core.frame.DataFrame
        modz transformed dataframe - a consensus signature of the input data
        weighted by replicate correlation
    """
    assert population_df.shape[0] > 0, "population_df must include at least one sample"  # noqa: S101

    method = check_correlation_method(method=method)

    # Step 1: pairwise correlations between samples (samples become columns)
    population_df = population_df.transpose()
    cor_df, pair_df = get_pairwise_correlation(population_df, method=method)
    pair_df = pair_df.round(precision)

    # Step 2: derive sample weights.
    # Mask each sample's self-correlation before averaging; work on a writable
    # copy because np.fill_diagonal mutates in place.
    corr_values = cor_df.values.copy()
    np.fill_diagonal(corr_values, np.nan)
    cor_df = pd.DataFrame(data=corr_values, index=cor_df.index, columns=cor_df.columns)

    # Negative correlations contribute nothing; mean ignores the NaN diagonal
    avg_corr = cor_df.clip(lower=0).mean(axis=1)

    # Floor at min_weight, then normalize so weights sum to 1
    avg_corr = avg_corr.clip(lower=min_weight)
    weights = (avg_corr / sum(avg_corr)).round(precision)

    # Step 3: weighted consensus (a lone sample passes through unweighted)
    if population_df.shape[1] == 1:
        return population_df.sum(axis=1)
    return (population_df * weights).sum(axis=1)

pycytominer.cyto_utils.output

Utility function to compress output data.

check_compression_method(compression)

Ensure compression options are set properly.

Parameters:

Name Type Description Default
compression str

The category of compression options available

required

Returns:

Type Description
None

Asserts available options

Source code in pycytominer/cyto_utils/output.py
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
def check_compression_method(compression: str):
    """Validate that the requested compression method is supported.

    Parameters
    ----------
    compression : str
        The category of compression options available

    Returns
    -------
    None
        Asserts available options
    """
    # COMPRESS_OPTIONS is the module-level whitelist of supported methods
    is_supported = compression in COMPRESS_OPTIONS
    assert is_supported, (  # noqa: S101
        f"{compression} is not supported, select one of {COMPRESS_OPTIONS}"
    )

output(df, output_filename, output_type='csv', sep=',', float_format=None, compression_options={'method': 'gzip', 'mtime': 1}, **kwargs)

Given an output file and compression options, write file to disk.

Parameters:

Name Type Description Default
df pandas.core.frame.DataFrame

a pandas dataframe that will be written to file

required
output_filename str

location of file to write

required
output_type str

type of output file to create

"csv"
sep str

file delimiter

','
float_format str

Decimal precision to use in writing output file as input to pd.DataFrame.to_csv(float_format=float_format). For example, use "%.3g" for 3 decimal precision.

None
compression_options str or dict

Contains compression options as input to pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.

{"method": "gzip", "mtime": 1}

Returns:

Type Description
str

returns output_filename

Examples:

import pandas as pd from pycytominer.cyto_utils import output

data_df = pd.concat( [ pd.DataFrame( { "Metadata_Plate": "X", "Metadata_Well": "a", "Cells_x": [0.1, 0.3, 0.8], "Nuclei_y": [0.5, 0.3, 0.1], } ), pd.DataFrame( { "Metadata_Plate": "X", "Metadata_Well": "b", "Cells_x": [0.4, 0.2, -0.5], "Nuclei_y": [-0.8, 1.2, -0.5], } ), ] ).reset_index(drop=True)

output_file = "test.csv.gz" output( df=data_df, output_filename=output_file, sep=",", compression_options={"method": "gzip", "mtime": 1}, float_format=None, )

Source code in pycytominer/cyto_utils/output.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
def output(
    df: pd.DataFrame,
    output_filename: str,
    output_type: str = "csv",
    sep: str = ",",
    float_format: Optional[str] = None,
    compression_options: Union[str, Dict] = {"method": "gzip", "mtime": 1},
    **kwargs,
):
    """Write a profile dataframe to disk, applying the requested compression.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        a pandas dataframe that will be written to file
    output_filename : str
        location of file to write
    output_type : str, default "csv"
        type of output file to create ("csv" or "parquet")
    sep : str
        file delimiter (csv output only)
    float_format : str, default None
        Decimal precision to use in writing output file as input to
        pd.DataFrame.to_csv(float_format=float_format). For example, use "%.3g" for 3
        decimal precision.
    compression_options : str or dict, default {"method": "gzip", "mtime": 1}
        Contains compression options as input to
        pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.

    Returns
    -------
    str
        returns output_filename

    Examples
    --------
    import pandas as pd
    from pycytominer.cyto_utils import output

    data_df = pd.DataFrame(
        {
            "Metadata_Plate": "X",
            "Metadata_Well": "a",
            "Cells_x": [0.1, 0.3, 0.8],
            "Nuclei_y": [0.5, 0.3, 0.1],
        }
    )

    output(
        df=data_df,
        output_filename="test.csv.gz",
        sep=",",
        compression_options={"method": "gzip", "mtime": 1},
        float_format=None,
    )
    """
    if output_type == "csv":
        # Normalize the compression argument into the dict form to_csv expects
        csv_compression = set_compression_method(compression=compression_options)

        df.to_csv(
            path_or_buf=output_filename,
            sep=sep,
            index=False,
            float_format=float_format,
            compression=csv_compression,
        )

    elif output_type == "parquet":
        # note: compression options will be validated against pd.DataFrame.to_parquet options
        # raising errors and tested through Pandas, PyArrow, etc. as necessary.
        df.to_parquet(path=output_filename, compression="snappy")

    return output_filename

set_compression_method(compression)

Set the compression options.

Parameters:

Name Type Description Default
compression str or dict

Contains compression options as input to pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.

required

Returns:

Type Description
(compression, dict)

A formatted dictionary expected by output()

Source code in pycytominer/cyto_utils/output.py
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
def set_compression_method(compression: Union[str, Dict]):
    """Normalize compression options into the dict form expected by output().

    Parameters
    ----------
    compression : str or dict
        Contains compression options as input to
        pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.

    Returns
    -------
    compression, dict
        A formatted dictionary expected by output()
    """
    # Coerce the two shorthand forms (None and bare string) into a dict
    if compression is None:
        compression = {"method": None}
    elif isinstance(compression, str):
        compression = {"method": compression}

    check_compression_method(compression["method"])
    return compression

pycytominer.cyto_utils.single_cell_ingest_utils

Utility functions for single cell ingest.

assert_linking_cols_complete(linking_cols='default', compartments='default')

Confirm that the linking cols and compartments are compatible.

Parameters:

Name Type Description Default
linking_cols str or dict

Specify how to link objects

"default"
compartments str or list

Which compartments used in the experiment.

"default"

Returns:

Type Description
None

Asserts linking columns are appropriately defined

.. note::

assert_linking_cols_complete() does not check if columns are present

Source code in pycytominer/cyto_utils/single_cell_ingest_utils.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
def assert_linking_cols_complete(linking_cols="default", compartments="default"):
    """Confirm that the linking cols and compartments are compatible.

    Parameters
    ----------
    linking_cols : str or dict, default "default"
        Specify how to link objects
    compartments : str or list, default "default"
        Which compartments used in the experiment.

    Returns
    -------
    None
        Asserts linking columns are appropriately defined

    .. note::
        assert_linking_cols_complete() does not check if columns are present
    """
    if linking_cols == "default":
        linking_cols = get_default_linking_cols()

    if compartments == "default":
        compartments = get_default_compartments()

    comp_err = "compartment not found. Check the specified compartments"

    pair_tracker = []
    seen_compartments = []
    for left, right_map in linking_cols.items():
        seen_compartments.append(left)
        assert left in compartments, f"{left} {comp_err}"  # noqa: S101
        for right in right_map:
            seen_compartments.append(right)
            assert right in compartments, f"{right} {comp_err}"  # noqa: S101
            # Record the pairing order-independently so A-B and B-A collapse
            pair_tracker.append("-".join(sorted([left, right])))

    # Each linkage must be declared from both sides (exactly twice)
    for combo, count in Counter(pair_tracker).items():
        assert count == 2, f"Missing column identifier between {combo}"  # noqa: S101

    # Every compartment must appear somewhere in the linking_cols
    seen_compartments = sorted(set(seen_compartments))
    diff_column = set(compartments).difference(seen_compartments)
    assert seen_compartments == sorted(compartments), (  # noqa: S101
        f"All compartments must be specified in the linking_cols, {diff_column} is missing"
    )

get_default_linking_cols()

Define the standard experiment linking columns between tables.

Returns:

Type Description
(linking_cols, dict)

A dictionary mapping columns that links together CellProfiler objects

.. note::

every dictionary pair has a 1 to 1 correspondence (e.g. cytoplasm-cells and cells-cytoplasm both must exist)

Source code in pycytominer/cyto_utils/single_cell_ingest_utils.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
def get_default_linking_cols():
    """Define the standard experiment linking columns between tables.

    Returns
    -------
    linking_cols, dict
        A dictionary mapping columns that links together CellProfiler objects

    .. note::
        every dictionary pair has a 1 to 1 correspondence (e.g. cytoplasm-cells and cells-cytoplasm both must exist)
    """
    # Cytoplasm links outward to both parents; cells/nuclei link back via ObjectNumber
    cytoplasm_links = {
        "cells": "Cytoplasm_Parent_Cells",
        "nuclei": "Cytoplasm_Parent_Nuclei",
    }
    return {
        "cytoplasm": cytoplasm_links,
        "cells": {"cytoplasm": "ObjectNumber"},
        "nuclei": {"cytoplasm": "ObjectNumber"},
    }

provide_linking_cols_feature_name_update(linking_cols='default')

Output a dictionary to use to update pandas dataframe column names from linking cols in the Metadata.

Parameters:

Name Type Description Default
linking_cols str or dict

Specify how to link objects

"default"

Returns:

Type Description
(update_name, dict)

Dictionary of the linking column names to update after they are used

Source code in pycytominer/cyto_utils/single_cell_ingest_utils.py
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
def provide_linking_cols_feature_name_update(linking_cols="default"):
    """Output a dictionary to use to update pandas dataframe column names from linking cols in the Metadata.

    Parameters
    ----------
    linking_cols : str or dict, default "default"
        Specify how to link objects

    Returns
    -------
    update_name, dict
        Dictionary of the linking column names to update after they are used
    """
    if linking_cols == "default":
        linking_cols = get_default_linking_cols()

    # Every linking column gets a "Metadata_" prefix once it has served its purpose
    update_name = {
        link_col: f"Metadata_{link_col}"
        for compartment in linking_cols
        for link_col in linking_cols[compartment].values()
    }
    return update_name

pycytominer.cyto_utils.util

Miscellaneous utility functions.

check_aggregate_operation(operation)

Confirm that the input operation for aggregation is currently supported.

Parameters:

Name Type Description Default
operation str

Aggregation operation to provide.

required

Returns:

Type Description
str

Correctly formatted operation method.

Source code in pycytominer/cyto_utils/util.py
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
def check_aggregate_operation(operation):
    """Confirm that the input operation for aggregation is currently supported.

    Parameters
    ----------
    operation : str
        Aggregation operation to provide.

    Returns
    -------
    str
        Correctly formatted operation method.

    """
    # Normalize case before checking against the supported set
    operation = operation.lower()
    supported_ops = ["mean", "median"]
    assert operation in supported_ops, (  # noqa: S101
        f"operation {operation} not supported, select one of {supported_ops}"
    )
    return operation

check_compartments(compartments)

Check if the input compartments are noncanonical compartments.

Parameters:

Name Type Description Default
compartments list of str

Input compartments.

required

Returns:

Type Description
None

Nothing is returned.

Source code in pycytominer/cyto_utils/util.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
def check_compartments(compartments):
    """Check if the input compartments are noncanonical compartments.

    Parameters
    ----------
    compartments : list of str
        Input compartments.

    Returns
    -------
    None
        Nothing is returned.

    """
    canonical = get_default_compartments()
    compartment_list = convert_compartment_format_to_list(compartments)

    # Collect any compartment that is not one of the canonical three
    non_canonical_compartments = [
        compartment
        for compartment in compartment_list
        if compartment not in canonical
    ]

    if non_canonical_compartments:
        warn_str = "Non-canonical compartment detected: {x}".format(
            x=", ".join(non_canonical_compartments)
        )
        warnings.warn(warn_str)

check_consensus_operation(operation)

Confirm that the input operation for consensus is currently supported.

Parameters:

Name Type Description Default
operation

Consensus operation to provide.

required

Returns:

Type Description
str

Correctly formatted operation method.

Source code in pycytominer/cyto_utils/util.py
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
def check_consensus_operation(operation):
    """Confirm that the input operation for consensus is currently supported.

    Parameters
    ----------
    operation: str
        Consensus operation to provide.

    Returns
    -------
    str
        Correctly formatted operation method.

    """
    operation = operation.lower()
    consensus_only_ops = ["modz"]  # All aggregation operations are also supported
    try:
        # Aggregation operations (mean/median) are valid consensus operations too
        return check_aggregate_operation(operation)
    except AssertionError:
        assert operation in consensus_only_ops, (  # noqa: S101
            f"operation {operation} not supported, select one of {consensus_only_ops} or see aggregate.py"
        )
    return operation

check_correlation_method(method)

Confirm that the input method is currently supported.

Parameters:

Name Type Description Default
method str

The correlation metric to use.

required

Returns:

Type Description
str

Correctly formatted correlation method.

Source code in pycytominer/cyto_utils/util.py
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
def check_correlation_method(method):
    """Confirm that the input method is currently supported.

    Parameters
    ----------
    method : str
        The correlation metric to use.

    Returns
    -------
    str
        Correctly formatted correlation method.

    """
    # Normalize case so callers may pass e.g. "Pearson"
    method = method.lower()
    supported_methods = ["pearson", "spearman", "kendall"]
    assert method in supported_methods, (  # noqa: S101
        f"method {method} not supported, select one of {supported_methods}"
    )
    return method

check_fields_of_view(data_fields_of_view, input_fields_of_view)

Confirm that the input list of fields of view is a subset of the list of fields of view in the image table.

Parameters:

Name Type Description Default
data_fields_of_view list of int

Fields of view in the image table.

required
input_fields_of_view list of int

Input fields of view.

required

Returns:

Type Description
None

Nothing is returned.

Source code in pycytominer/cyto_utils/util.py
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
def check_fields_of_view(data_fields_of_view, input_fields_of_view):
    """Confirm that the input list of fields of view is a subset of the list of fields of view in the image table.

    Parameters
    ----------
    data_fields_of_view : list of int
        Fields of view in the image table.
    input_fields_of_view : list of int
        Input fields of view.

    Returns
    -------
    None
        Nothing is returned.

    Raises
    ------
    ValueError
        If any input field of view is absent from the image table.

    """
    # Fix: the original wrapped an `assert` in try/except to raise ValueError.
    # Asserts are stripped under `python -O`, which would silently disable this
    # validation. Check the condition directly instead.
    # np.intersect1d returns the sorted unique values present in both inputs;
    # if fewer than the requested fields, some requested fields are missing.
    overlap = np.intersect1d(data_fields_of_view, input_fields_of_view)
    if len(list(overlap)) != len(input_fields_of_view):
        raise ValueError(
            "Some of the input fields of view are not present in the image table."
        )

check_fields_of_view_format(fields_of_view)

Confirm that the input fields of view is valid.

Parameters:

Name Type Description Default
fields_of_view list of int

List of integer fields of view.

required

Returns:

Type Description
str or list of int

Correctly formatted fields_of_view variable.

Source code in pycytominer/cyto_utils/util.py
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
def check_fields_of_view_format(fields_of_view):
    """Confirm that the input fields of view is valid.

    Parameters
    ----------
    fields_of_view : list of int
        List of integer fields of view.

    Returns
    -------
    str or list of int
        Correctly formatted fields_of_view variable.

    """
    # "all" is the sentinel meaning every field of view; pass it through untouched
    if fields_of_view == "all":
        return fields_of_view

    if not isinstance(fields_of_view, list):
        raise TypeError(
            f"Variable of type list expected, however type {type(fields_of_view)} was passed."
        )

    if all(isinstance(entry, int) for entry in fields_of_view):
        return fields_of_view

    # Attempt to coerce non-int entries (e.g. numeric strings) to int
    try:
        return [int(entry) for entry in fields_of_view]
    except ValueError:
        raise TypeError(
            "Variables of type int expected, however some of the input fields of view are not integers."
        )

check_image_features(image_features, image_columns)

Confirm that the input list of image features are present in the image table.

Parameters:

Name Type Description Default
image_features

Input image features to extract from the image table.

required
image_columns

Columns in the image table

required

Returns:

Type Description
None

Nothing is returned.

Source code in pycytominer/cyto_utils/util.py
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
def check_image_features(image_features, image_columns):
    """Confirm that the input list of image features are present in the image table.

    Parameters
    ----------
    image_features: list of str
        Input image features to extract from the image table.
    image_columns: list of str
        Columns in the image table

    Returns
    -------
    None
        Nothing is returned.

    Raises
    ------
    ValueError
        If any requested image feature is not present in the image table.
    """
    if "Image" in {img_col.split("_")[0] for img_col in image_columns}:
        # Image has already been prepended to most, but not all, columns
        level = 1
        image_columns = [x for x in image_columns if "_" in x]
    else:
        level = 0

    # Fix 1: the original rebuilt this set inside the generator for every
    # feature (O(features * columns)); build it once instead.
    available_features = {img_col.split("_")[level] for img_col in image_columns}

    # Fix 2: the original used an `assert` wrapped in try/except; asserts are
    # stripped under `python -O`, silently disabling validation. Check directly.
    if not all(feature in available_features for feature in image_features):
        raise ValueError(
            "Some of the input image features are not present in the image table."
        )

extract_image_features(image_feature_categories, image_df, image_cols, strata)

Confirm that the input list of image features categories are present in the image table and then extract those features.

Parameters:

Name Type Description Default
image_feature_categories list of str

Input image feature groups to extract from the image table.

required
image_df DataFrame

Image dataframe.

required
image_cols list of str

Columns to select from the image table.

required
strata list of str

The columns to groupby and aggregate single cells.

required

Returns:

Name Type Description
image_features_df DataFrame

Dataframe with extracted image features.

image_feature_categories list of str

Correctly formatted image feature categories.

Source code in pycytominer/cyto_utils/util.py
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
def extract_image_features(image_feature_categories, image_df, image_cols, strata):
    """Confirm that the input list of image features categories are present in the image table and then extract those features.

    Parameters
    ----------
    image_feature_categories : list of str
        Input image feature groups to extract from the image table.
    image_df : pandas.core.frame.DataFrame
        Image dataframe.
    image_cols : list of str
        Columns to select from the image table.
    strata :  list of str
        The columns to groupby and aggregate single cells.

    Returns
    -------
    image_features_df : pandas.core.frame.DataFrame
        Dataframe with extracted image features, prefixed appropriately and
        joined with the image_cols and strata columns.

    """
    # Validate the requested feature groups against the image table columns
    check_image_features(image_feature_categories, list(image_df.columns))

    # Select every column whose name begins with one of the requested categories
    category_prefixes = tuple(image_feature_categories)
    selected_columns = list(
        image_df.columns[image_df.columns.str.startswith(category_prefixes)]
    )

    image_features_df = image_df[selected_columns]

    # Rename: Count_* columns become metadata; already-prefixed Image_* columns
    # stay as-is; everything else gets an Image_ prefix.
    renamed_columns = []
    for col in image_features_df.columns:
        if col.startswith("Count_"):
            renamed_columns.append(f"Metadata_{col}")
        elif col.startswith("Image_"):
            renamed_columns.append(col)
        else:
            renamed_columns.append(f"Image_{col}")
    image_features_df.columns = renamed_columns

    # Add image_cols and strata to the dataframe
    image_features_df = pd.concat(
        [image_df[list(np.union1d(image_cols, strata))], image_features_df], axis=1
    )

    return image_features_df

get_default_compartments()

Return default compartments.

Returns:

Type Description
list of str

Default compartments.

Source code in pycytominer/cyto_utils/util.py
16
17
18
19
20
21
22
23
24
25
def get_default_compartments():
    """Return the canonical CellProfiler compartment names.

    Returns
    -------
    list of str
        Default compartments.

    """
    default_compartments = ["cells", "cytoplasm", "nuclei"]
    return default_compartments

get_pairwise_correlation(population_df, method='pearson')

Given a population dataframe, calculate all pairwise correlations.

Parameters:

Name Type Description Default
population_df DataFrame

Includes metadata and observation features.

required
method str

Which correlation matrix to use to test cutoff.

"pearson"

Returns:

Type Description
list of str

Features to exclude from the population_df.

Source code in pycytominer/cyto_utils/util.py
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
def get_pairwise_correlation(population_df, method="pearson"):
    """Given a population dataframe, calculate all pairwise correlations.

    Parameters
    ----------
    population_df : pandas.core.frame.DataFrame
        Includes metadata and observation features.
    method : str, default "pearson"
        Which correlation matrix to use to test cutoff.

    Returns
    -------
    data_cor_df : pandas.core.frame.DataFrame
        Symmetrical correlation matrix between columns.
    pairwise_df : pandas.core.frame.DataFrame
        Long-format pairwise correlations with columns
        ["pair_a", "pair_b", "correlation"].

    """
    # Check that the input method is supported
    method = check_correlation_method(method)

    # np.corrcoef is faster than DataFrame.corr but cannot handle NaN/Inf,
    # and only computes Pearson; fall back to pandas otherwise.
    raw_values = population_df.values
    is_clean = not (np.any(np.isnan(raw_values)) or np.any(np.isinf(raw_values)))
    if method == "pearson" and is_clean:
        labels = population_df.columns
        data_cor_df = pd.DataFrame(
            np.corrcoef(population_df.transpose()), index=labels, columns=labels
        )
    else:
        data_cor_df = population_df.corr(method=method)

    # Mask everything except the strict lower triangle with NaN so that each
    # unordered pair appears exactly once in the long format below.
    lower_triangle_mask = np.tril(np.ones(data_cor_df.shape), k=-1).astype(bool)
    lower_tri_df = data_cor_df.where(lower_triangle_mask)

    # stack() drops the NaN entries, leaving one row per pair
    pairwise_df = lower_tri_df.stack().reset_index()
    pairwise_df.columns = ["pair_a", "pair_b", "correlation"]

    return data_cor_df, pairwise_df

load_known_metadata_dictionary(metadata_file=default_metadata_file)

Load previously known metadata columns per compartment from metadata text file.

Parameters:

Name Type Description Default
metadata_file str

File location of the metadata text file which should be a tab-separated file with two columns: ["compartment", "feature"]. If not provided, the default metadata file will be used.

default_metadata_file

Returns:

Type Description
dict

Compartment (keys) mappings to previously known metadata (values).

Source code in pycytominer/cyto_utils/util.py
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
def load_known_metadata_dictionary(metadata_file=default_metadata_file):
    """Load previously known metadata columns per compartment from metadata text file.

    Parameters
    ----------
    metadata_file : str, optional
        File location of the metadata text file which should be a tab-separated file with two columns: ["compartment", "feature"].
        If not provided, the default metadata file will be used.

    Returns
    -------
    dict
        Compartment (keys) mappings to previously known metadata (values).

    """
    metadata_dict = {}
    with open(metadata_file) as meta_fh:
        # Skip the header row
        next(meta_fh)
        for line in meta_fh:
            compartment, feature = line.strip().split("\t")
            metadata_dict.setdefault(compartment.lower(), []).append(feature)

    return metadata_dict

pycytominer.cyto_utils.write_gct

Module to write a gct file from a pandas DataFrame.

Transform profiles into a gct (Gene Cluster Text) file. A gct is a tab-delimited text file that traditionally stores gene expression data. File Format Description: https://clue.io/connectopedia/gct_format.

Modified from cytominer_scripts "write_gct" written in R https://github.com/broadinstitute/cytominer_scripts/blob/master/write_gct.R

write_gct(profiles, output_file, features='infer', meta_features='infer', feature_metadata=None, version='#1.3')

Convert profiles to a .gct file.

Parameters:

Name Type Description Default
profiles DataFrame

DataFrame of profiles.

required
output_file str

If provided, will write gct to file.

required
features list

A list of strings corresponding to feature measurement column names in the profiles DataFrame. All features listed must be found in profiles. Defaults to "infer". If "infer", then assume cell painting features are those prefixed with "Cells", "Nuclei", or "Cytoplasm".

'infer'
meta_features list

A list of strings corresponding to metadata column names in the profiles DataFrame. All features listed must be found in profiles. Defaults to "infer". If "infer", then assume metadata features are those prefixed with "Metadata"

'infer'
feature_metadata DataFrame
None
version str

Important for gct loading into Morpheus

"#1.3"

Returns:

Type Description
None

Writes gct to file

Source code in pycytominer/cyto_utils/write_gct.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def write_gct(
    profiles,
    output_file,
    features="infer",
    meta_features="infer",
    feature_metadata=None,
    version="#1.3",
):
    """Convert profiles to a .gct file.

    Parameters
    ----------
    profiles : pandas.core.frame.DataFrame
        DataFrame of profiles.
    output_file : str
        If provided, will write gct to file.
    features : list
        A list of strings corresponding to feature measurement column names in the
        `profiles` DataFrame. All features listed must be found in `profiles`.
        Defaults to "infer". If "infer", then assume cell painting features are those
        prefixed with "Cells", "Nuclei", or "Cytoplasm".
    meta_features : list
        A list of strings corresponding to metadata column names in the `profiles`
        DataFrame. All features listed must be found in `profiles`. Defaults to "infer".
        If "infer", then assume metadata features are those prefixed with "Metadata"
    feature_metadata : pandas.core.frame.DataFrame, default None
        Optional per-feature annotations merged into the output; must contain a
        row indexed "id" that stores the feature metadata names.
    version : str, default "#1.3"
        Important for gct loading into Morpheus

    Returns
    -------
    None
        Writes gct to file
    """
    # Note, only version 1.3 is currently supported
    assert version == "#1.3", "Only version #1.3 is currently supported."  # noqa: S101

    # Step 1: Create first two rows of data
    if features == "infer":
        features = infer_cp_features(profiles)
    # Transpose so features become rows (gct data rows) and samples become columns
    feature_df = profiles.loc[:, features].reset_index(drop=True).transpose()

    # Separate out metadata features
    if meta_features == "infer":
        meta_features = infer_cp_features(profiles, metadata=True)
    metadata_df = profiles.loc[:, meta_features]

    # Step 2: Get the sample metadata portion of the output file
    # Double transpose: first to name sample columns "SAMPLE_<n>", then back so
    # each metadata field becomes a header row with an "id" label column.
    metadata_part = metadata_df.transpose()
    metadata_part.columns = [f"SAMPLE_{x}" for x in metadata_part.columns]
    metadata_part = (
        metadata_part.transpose()
        .reset_index()
        .rename({"index": "id"}, axis="columns")
        .transpose()
    )
    # Strip the "Metadata_" prefix from the metadata row labels
    metadata_part.index = [x.replace("Metadata_", "") for x in metadata_part.index]

    nrow_feature, ncol_features = feature_df.shape
    _, ncol_metadata = metadata_df.shape

    # Step 3: Compile feature metadata
    # Stack the sample-metadata header rows on top of the feature rows
    full_df = pd.concat([metadata_part, feature_df], axis="rows")
    if isinstance(feature_metadata, pd.DataFrame):
        nrow_metadata = feature_metadata.shape[1]
        assert (  # noqa: S101
            "id" in feature_metadata.index.tolist()
        ), "make sure feature metadata has row named 'id' that stores feature metadata names!"
        # Right-merge on the index keeps every row of full_df while attaching
        # the provided feature annotations as leading columns
        full_df = feature_metadata.merge(
            full_df, how="right", left_index=True, right_index=True
        )
    else:
        # No feature metadata supplied: insert a single placeholder column with
        # NaN for the sample-metadata rows and the feature name for feature rows
        feature_metadata = (
            ["cp_feature_name"] + [np.nan] * ncol_metadata + feature_df.index.tolist()
        )
        nrow_metadata = 1
        full_df.insert(0, column="feature_metadata", value=feature_metadata)
    full_df = full_df.reset_index()

    # Step 4: Compile all data dimensions
    data_dimensions = [nrow_feature, ncol_features, nrow_metadata, ncol_metadata]

    # Step 5: Write output gct file
    # gct layout: version line, dimensions line, then the assembled table rows
    with open(output_file, "w", newline="") as gctfile:
        gctwriter = csv.writer(gctfile, delimiter="\t")
        gctwriter.writerow([version])
        gctwriter.writerow(data_dimensions)
        for feature, row in full_df.iterrows():
            gctwriter.writerow(row)