Skip to content

Main Functions

Pycytominer is a suite of common functions used to process high dimensional readouts from high-throughput cell experiments.

pycytominer.aggregate

Aggregate profiles based on given grouping variables.

aggregate(population_df, strata=['Metadata_Plate', 'Metadata_Well'], features='infer', operation='median', output_file=None, output_type='csv', compute_object_count=False, object_feature='Metadata_ObjectNumber', subset_data_df=None, compression_options=None, float_format=None)

Combine population dataframe variables by strata groups using given operation.

Parameters:

Name Type Description Default
population_df DataFrame

DataFrame to group and aggregate.

required
strata list of str

Columns to groupby and aggregate.

["Metadata_Plate", "Metadata_Well"]
features list of str

List of features that should be aggregated.

"infer"
operation str

How the data is aggregated. Currently only supports one of ['mean', 'median'].

"median"
output_file str or file handle

If provided, will write aggregated profiles to file. If not specified, will return the aggregated profiles. We recommend naming the file based on the plate name.

None
output_type str

If provided, will write aggregated profiles as the specified file type (either CSV or parquet). If not specified and output_file is provided, the file will be written as CSV by default.

'csv'
compute_object_count bool

Whether or not to compute object counts.

False
object_feature str

Object number feature. Only used if compute_object_count=True.

"Metadata_ObjectNumber"
subset_data_df DataFrame

How to subset the input.

None
compression_options str or dict

Contains compression options as input to pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.

None
float_format str

Format string for floating point numbers when writing the output file, passed to pd.DataFrame.to_csv(float_format=float_format). For example, use "%.3g" for 3 significant digits.

None

Returns:

Name Type Description
population_df (DataFrame, optional)

DataFrame of aggregated features. If output_file=None, then return the DataFrame. If you specify output_file, then write to file and do not return data.

Source code in pycytominer/aggregate.py
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
def aggregate(
    population_df: pd.DataFrame,
    strata: List[str] = ["Metadata_Plate", "Metadata_Well"],
    features: Union[List[str], str] = "infer",
    operation: str = "median",
    output_file: Optional[str] = None,
    output_type: Optional[str] = "csv",
    compute_object_count: bool = False,
    object_feature: str = "Metadata_ObjectNumber",
    subset_data_df: Optional[pd.DataFrame] = None,
    compression_options: Optional[Union[str, Dict[str, Any]]] = None,
    float_format: Optional[str] = None,
) -> Optional[pd.DataFrame]:
    """Combine population dataframe variables by strata groups using given operation.

    Parameters
    ----------
    population_df : pandas.core.frame.DataFrame
        DataFrame to group and aggregate.
    strata : list of str, default ["Metadata_Plate", "Metadata_Well"]
        Columns to group by. The default list is never mutated by this function.
    features : list of str or str, default "infer"
        Feature columns to aggregate. If the string "infer", the feature columns
        are determined by calling ``infer_cp_features(population_df)``.
    operation : str, default "median"
        How the data is aggregated. Currently only supports one of ['mean', 'median'].
        The value is validated by ``check_aggregate_operation``.
    output_file : str or file handle, optional
        If provided, the aggregated profiles are written via ``output`` and
        nothing is returned. If None, the aggregated profiles are returned.
        We recommend naming the file based on the plate name.
    output_type : str, optional
        File type to write (either CSV or parquet). Only used when
        ``output_file`` is provided.
    compute_object_count : bool, default False
        Whether or not to compute object counts per strata group. The count is
        stored in a column named "Metadata_Object_Count".
    object_feature : str, default "Metadata_ObjectNumber"
        Column whose non-null entries are counted. Only used if
        ``compute_object_count=True``.
    subset_data_df : pandas.core.frame.DataFrame, optional
        If a DataFrame is given, ``population_df`` is restricted to rows that
        match it on all of its columns (inner merge).
    compression_options : str or dict, optional
        Passed through to ``output``; see
        pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.
    float_format : str, optional
        Passed through to ``output``; see
        pd.DataFrame.to_csv(float_format=float_format). For example, "%.3g".

    Returns
    -------
    population_df : pandas.core.frame.DataFrame, optional
        DataFrame of aggregated features. If output_file=None, then return the
        DataFrame. If you specify output_file, then write to file and do not return
        data.

    """
    # Validate the operation; the helper's return value is what is compared below
    operation = check_aggregate_operation(operation)

    # Subset the data to specified samples: inner merge on every column of the
    # subset frame, then restore the original column order of population_df
    if isinstance(subset_data_df, pd.DataFrame):
        population_df = subset_data_df.merge(
            population_df, how="inner", on=subset_data_df.columns.tolist()
        ).reindex(population_df.columns, axis="columns")

    # Keep the grouping columns aside; they are re-attached after the dtype cast
    strata_df = population_df[strata]

    # Only extract single object column in preparation for count
    if compute_object_count:
        # NOTE(review): this groupby uses the default dropna behaviour whereas the
        # aggregation groupby below passes dropna=False - confirm whether groups
        # with missing strata values are meant to receive a count.
        count_object_df = (
            population_df.loc[:, np.union1d(strata, [object_feature])]
            .groupby(strata)[object_feature]
            .count()
            .reset_index()
            .rename(columns={object_feature: "Metadata_Object_Count"})
        )

    # Guard with isinstance so array-like `features` are never compared
    # elementwise against the sentinel string
    if isinstance(features, str) and features == "infer":
        features = infer_cp_features(population_df)
    population_df = population_df[features]

    # Fix dtype of input features (they should all be floats!)
    population_df = population_df.astype(float)

    # Merge back metadata used to aggregate by
    population_df = pd.concat([strata_df, population_df], axis="columns")

    # Perform aggregating function; dropna=False keeps groups with missing keys
    grouped = population_df.groupby(strata, dropna=False)

    if operation == "median":
        population_df = grouped.median().reset_index()
    else:
        population_df = grouped.mean().reset_index()

    # Compute objects counts; right merge keeps every aggregated group
    if compute_object_count:
        population_df = count_object_df.merge(population_df, on=strata, how="right")

    # Aggregated image number and object number do not make sense.
    # Pass the flat list of labels: wrapping it in another list (as before)
    # hands pandas a nested list instead of column labels.
    columns_to_drop = [
        column
        for column in population_df.columns
        if column in ("ImageNumber", "ObjectNumber")
    ]
    if columns_to_drop:
        population_df = population_df.drop(columns_to_drop, axis="columns")

    if output_file is not None:
        output(
            df=population_df,
            output_filename=output_file,
            output_type=output_type,
            compression_options=compression_options,
            float_format=float_format,
        )
        return None

    return population_df

pycytominer.annotate

Annotates profiles with metadata information.

annotate(profiles, platemap, join_on=['Metadata_well_position', 'Metadata_Well'], output_file=None, output_type='csv', add_metadata_id_to_platemap=True, format_broad_cmap=False, clean_cellprofiler=True, external_metadata=None, external_join_left=None, external_join_right=None, compression_options=None, float_format=None, cmap_args={}, **kwargs)

Add metadata to aggregated profiles.

Parameters:

Name Type Description Default
profiles DataFrame or file

DataFrame or file path of profiles.

required
platemap DataFrame or file

Dataframe or file path of platemap metadata.

required
join_on list or str

Which variables to use when merging the profiles and the platemap. The first element indicates the variable(s) in platemap and the second element indicates the variable(s) in profiles to merge on. Note the setting of add_metadata_id_to_platemap.

["Metadata_well_position", "Metadata_Well"]
output_file str

If not specified, will return the annotated profiles. We recommend that this output file be suffixed with "_augmented.csv".

None
output_type str

If provided, will write annotated profiles as the specified file type (either CSV or parquet). If not specified and output_file is provided, the file will be written as CSV by default.

'csv'
add_metadata_id_to_platemap bool

Whether the plate map variables possibly need "Metadata" pre-pended

True
format_broad_cmap bool

Whether we need to add columns to make compatible with Broad CMAP naming conventions.

False
clean_cellprofiler

Clean specific CellProfiler feature names.

True
external_metadata str

File with additional metadata information

None
external_join_left str

Merge column in the profile metadata.

None
external_join_right

Merge column in the external metadata.

None
compression_options str or dict

Contains compression options as input to pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.

None
float_format str

Format string for floating point numbers when writing the output file, passed to pd.DataFrame.to_csv(float_format=float_format). For example, use "%.3g" for 3 significant digits.

None
cmap_args dict

Potential keyword arguments for annotate_cmap(). See cyto_utils/annotate_custom.py for more details.

{}

Returns:

Name Type Description
annotated (DataFrame, optional)

DataFrame of annotated features. If output_file=None, then return the DataFrame. If you specify output_file, then write to file and do not return data.

Source code in pycytominer/annotate.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
def annotate(
    profiles,
    platemap,
    join_on=["Metadata_well_position", "Metadata_Well"],
    output_file=None,
    output_type="csv",
    add_metadata_id_to_platemap=True,
    format_broad_cmap=False,
    clean_cellprofiler=True,
    external_metadata=None,
    external_join_left=None,
    external_join_right=None,
    compression_options=None,
    float_format=None,
    cmap_args={},
    **kwargs,
):
    """Add metadata to aggregated profiles.

    The platemap is inner-merged with the profiles, optional CMAP formatting and
    CellProfiler cleaning are applied, optional external metadata is left-merged
    in, and finally the metadata columns are moved to the front.

    Parameters
    ----------
    profiles : pandas.core.frame.DataFrame or file
        DataFrame or file path of profiles (loaded with ``load_profiles``).
    platemap : pandas.core.frame.DataFrame or file
        DataFrame or file path of platemap metadata (loaded with ``load_platemap``).
    join_on : list or str, default: ["Metadata_well_position", "Metadata_Well"]
        Merge keys. Element 0 is used as the platemap key and element 1 as the
        profiles key. Note the setting of `add_metadata_id_to_platemap`.
    output_file : str, optional
        If not specified, will return the annotated profiles. We recommend that
        this output file be suffixed with "_augmented.csv".
    output_type : str, optional
        File type to write (either CSV or parquet). Only used when
        ``output_file`` is provided.
    add_metadata_id_to_platemap : bool, default True
        Forwarded to ``load_platemap``; whether the plate map variables possibly
        need "Metadata" pre-pended.
    format_broad_cmap : bool, default False
        Whether to call ``annotate_cmap`` to add Broad CMAP compatible columns.
    clean_cellprofiler : bool, default True
        Whether to call ``cp_clean`` on the annotated profiles.
    external_metadata : str or pandas.core.frame.DataFrame, optional
        Additional metadata. A DataFrame is copied before use; any other
        non-None value is treated as a path and read with ``pd.read_csv``.
    external_join_left : str, optional
        Merge column in the profile metadata.
    external_join_right : str, optional
        Merge column in the external metadata.
    compression_options : str or dict, optional
        Passed through to ``output``; see
        pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.
    float_format : str, optional
        Passed through to ``output``; see
        pd.DataFrame.to_csv(float_format=float_format). For example, "%.3g".
    cmap_args : dict, default {}
        Keyword arguments unpacked into ``annotate_cmap()``. The default dict is
        only unpacked, never mutated, by this function.
    **kwargs
        Accepted but not used by this function body.

    Returns
    -------
    annotated : pandas.core.frame.DataFrame, optional
        DataFrame of annotated features. If output_file=None, then return the
        DataFrame. If you specify output_file, then write to file and do not return
        data.

    Raises
    ------
    FileNotFoundError
        If ``external_metadata`` is a path that does not exist.
    """
    # Resolve both inputs into DataFrames
    profiles = load_profiles(profiles)
    platemap = load_platemap(platemap, add_metadata_id_to_platemap)

    platemap_key = join_on[0]
    profiles_key = join_on[1]

    # Platemap columns that clash with profile columns get a "_platemap" suffix
    annotated = platemap.merge(
        profiles,
        left_on=platemap_key,
        right_on=profiles_key,
        how="inner",
        suffixes=["_platemap", None],
    )

    # With distinct key names the merge keeps both; discard the platemap one
    if platemap_key != profiles_key:
        annotated = annotated.drop(platemap_key, axis="columns")

    # Add specific Connectivity Map (CMAP) formatting
    if format_broad_cmap:
        annotated = annotate_cmap(annotated, annotate_join_on=profiles_key, **cmap_args)

    if clean_cellprofiler:
        annotated = cp_clean(annotated)

    if isinstance(external_metadata, pd.DataFrame):
        # Work on a copy so the caller's column names are left untouched
        external_metadata = external_metadata.copy()
    elif external_metadata is not None:
        if not os.path.exists(external_metadata):
            raise FileNotFoundError(
                f"external metadata at {external_metadata} does not exist"
            )
        external_metadata = pd.read_csv(external_metadata)

    if isinstance(external_metadata, pd.DataFrame):
        # Make sure every external column carries the "Metadata_" prefix
        external_metadata.columns = [
            name if name.startswith("Metadata_") else f"Metadata_{name}"
            for name in external_metadata.columns
        ]

        merged = annotated.merge(
            external_metadata,
            left_on=external_join_left,
            right_on=external_join_right,
            how="left",
            suffixes=[None, "_external"],
        )
        annotated = merged.reset_index(drop=True).drop_duplicates()

    # Reorder columns: metadata first, everything else after
    meta_cols = infer_cp_features(annotated, metadata=True)
    other_cols = annotated.drop(columns=meta_cols).columns.tolist()
    annotated = annotated.loc[:, meta_cols + other_cols]

    if output_file is None:
        return annotated

    output(
        df=annotated,
        output_filename=output_file,
        output_type=output_type,
        compression_options=compression_options,
        float_format=float_format,
    )

pycytominer.consensus

Acquire consensus signatures for input samples.

consensus(profiles, replicate_columns=['Metadata_Plate', 'Metadata_Well'], operation='median', features='infer', output_file=None, output_type='csv', compression_options=None, float_format=None, modz_args={'method': 'spearman'})

Form level 5 consensus profile data.

Parameters:

Name Type Description Default
profiles DataFrame or file

DataFrame or file of profiles.

required
replicate_columns list, defaults to ["Metadata_Plate", "Metadata_Well"]

Metadata columns indicating which replicates to collapse

['Metadata_Plate', 'Metadata_Well']
operation str, defaults to "median"

The method used to form consensus profiles.

'median'
features list

A list of strings corresponding to feature measurement column names in the profiles DataFrame. All features listed must be found in profiles. Defaults to "infer". If "infer", then assume cell painting features are those prefixed with "Cells", "Nuclei", or "Cytoplasm".

'infer'
output_file str

If provided, will write consensus profiles to file. If not specified, will return the consensus profiles as output.

None
output_type str

If provided, will write consensus profiles as the specified file type (either CSV or parquet). If not specified and output_file is provided, the file will be written as CSV by default.

'csv'
compression_options str or dict

Contains compression options as input to pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.

None
float_format str

Format string for floating point numbers when writing the output file, passed to pd.DataFrame.to_csv(float_format=float_format). For example, use "%.3g" for 3 significant digits.

None
modz_args dict

Additional custom arguments passed as kwargs if operation="modz". See pycytominer.cyto_utils.modz for more details.

{'method': 'spearman'}

Returns:

Name Type Description
consensus_df (DataFrame, optional)

The consensus profile DataFrame. If output_file=None, then return the DataFrame. If you specify output_file, then write to file and do not return data.

Examples:

import pandas as pd
from pycytominer import consensus

data_df = pd.concat( [ pd.DataFrame( { "Metadata_Plate": "X", "Metadata_Well": "a", "Cells_x": [0.1, 0.3, 0.8], "Nuclei_y": [0.5, 0.3, 0.1], } ), pd.DataFrame( { "Metadata_Plate": "X", "Metadata_Well": "b", "Cells_x": [0.4, 0.2, -0.5], "Nuclei_y": [-0.8, 1.2, -0.5], } ), ] ).reset_index(drop=True)

consensus_df = consensus( profiles=data_df, replicate_columns=["Metadata_Plate", "Metadata_Well"], operation="median", features="infer", output_file=None, )

Source code in pycytominer/consensus.py
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
def consensus(
    profiles,
    replicate_columns=["Metadata_Plate", "Metadata_Well"],
    operation="median",
    features="infer",
    output_file=None,
    output_type="csv",
    compression_options=None,
    float_format=None,
    modz_args={"method": "spearman"},
):
    """Form level 5 consensus profile data.

    Replicates identified by ``replicate_columns`` are collapsed either with
    ``modz`` (when ``operation == "modz"``) or with ``aggregate`` (any other
    operation accepted by ``check_consensus_operation``).

    Parameters
    ----------
    profiles : pandas.core.frame.DataFrame or file
        DataFrame or file of profiles (loaded with ``load_profiles``).
    replicate_columns : list, defaults to ["Metadata_Plate", "Metadata_Well"]
        Metadata columns indicating which replicates to collapse.
    operation : str, defaults to "median"
        The method used to form consensus profiles.
    features : list
        A list of strings corresponding to feature measurement column names in the
        `profiles` DataFrame. All features listed must be found in `profiles`.
        Defaults to "infer", which is forwarded unchanged to ``modz`` or
        ``aggregate``.
    output_file : str, optional
        If provided, will write consensus profiles to file. If not specified, will
        return the consensus profiles as output.
    output_type : str, optional
        File type to write (either CSV or parquet). Only used when
        ``output_file`` is provided.
    compression_options : str or dict, optional
        Passed through to ``output``; see
        pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.
    float_format : str, optional
        Passed through to ``output``; see
        pd.DataFrame.to_csv(float_format=float_format). For example, "%.3g".
    modz_args : dict, optional
        Additional custom arguments passed as kwargs if operation="modz".
        See pycytominer.cyto_utils.modz for more details. The default dict is
        only unpacked, never mutated, by this function.

    Returns
    -------
    consensus_df : pandas.core.frame.DataFrame, optional
        The consensus profile DataFrame. If output_file=None, then return the
        DataFrame. If you specify output_file, then write to file and do not return
        data.

    Examples
    --------
    import pandas as pd
    from pycytominer import consensus

    data_df = pd.concat(
        [
            pd.DataFrame(
                {
                    "Metadata_Plate": "X",
                    "Metadata_Well": "a",
                    "Cells_x": [0.1, 0.3, 0.8],
                    "Nuclei_y": [0.5, 0.3, 0.1],
                }
            ),
            pd.DataFrame(
                {
                    "Metadata_Plate": "X",
                    "Metadata_Well": "b",
                    "Cells_x": [0.4, 0.2, -0.5],
                    "Nuclei_y": [-0.8, 1.2, -0.5],
                }
            ),
        ]
    ).reset_index(drop=True)

    consensus_df = consensus(
        profiles=data_df,
        replicate_columns=["Metadata_Plate", "Metadata_Well"],
        operation="median",
        features="infer",
        output_file=None,
    )
    """
    # Confirm that the operation is supported.
    # NOTE(review): the helper's return value is discarded here, so `operation`
    # is compared below exactly as the caller passed it - confirm this is intended.
    check_consensus_operation(operation)

    # Resolve the input into a DataFrame
    profiles = load_profiles(profiles)

    if operation != "modz":
        # Every other supported operation is delegated to aggregate()
        consensus_df = aggregate(
            population_df=profiles,
            strata=replicate_columns,
            features=features,
            operation=operation,
            subset_data_df=None,
        )
    else:
        consensus_df = modz(
            population_df=profiles,
            replicate_columns=replicate_columns,
            features=features,
            **modz_args,
        )

    # Without a destination, hand the result back to the caller
    if output_file is None:
        return consensus_df

    output(
        df=consensus_df,
        output_filename=output_file,
        output_type=output_type,
        compression_options=compression_options,
        float_format=float_format,
    )

pycytominer.feature_select

Select features to use in downstream analysis based on specified selection method.

feature_select(profiles, features='infer', image_features=False, samples='all', operation='variance_threshold', output_file=None, output_type='csv', na_cutoff=0.05, corr_threshold=0.9, corr_method='pearson', freq_cut=0.05, unique_cut=0.01, compression_options=None, float_format=None, blocklist_file=None, outlier_cutoff=500, noise_removal_perturb_groups=None, noise_removal_stdev_cutoff=None)

Perform feature selection based on the given operation.

Parameters:

Name Type Description Default
profiles DataFrame or file

DataFrame or file of profiles.

required
features list

A list of strings corresponding to feature measurement column names in the profiles DataFrame. All features listed must be found in profiles. Defaults to "infer". If "infer", then assume cell painting features are those prefixed with "Cells", "Nuclei", or "Cytoplasm".

'infer'
image_features

Whether the profiles contain image features.

False
samples list or str

Samples to provide operation on.

"all"
operation

Operations to perform on the input profiles.

'variance_threshold'
output_file str

If provided, will write feature selected profiles to file. If not specified, will return the feature selected profiles as output. We recommend that this output file be suffixed with "_normalized_variable_selected.csv".

None
output_type str

If provided, will write feature selected profiles as the specified file type (either CSV or parquet). If not specified and output_file is provided, the file will be written as CSV by default.

'csv'
na_cutoff float

Proportion of missing values in a column to tolerate before removing.

0.05
corr_threshold float

Value between (0, 1) to exclude features above if any two features are correlated above this threshold.

0.9
corr_method str

Correlation type to compute. Allowed methods are "spearman", "kendall" and "pearson".

"pearson"
freq_cut float

Ratio (2nd most common feature val / most common). Must range between 0 and 1. Remove features lower than freq_cut. A low freq_cut will remove features that have large difference between the most common feature and second most common feature. (e.g. this will remove a feature: [1, 1, 1, 1, 0.01, 0.01, ...])

0.05
unique_cut

Ratio (num unique features / num samples). Must range between 0 and 1. Remove features less than unique cut. A low unique_cut will remove features that have very few different measurements compared to the number of samples.

0.01
compression_options str or dict

Contains compression options as input to pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.

None
float_format str

Format string for floating point numbers when writing the output file, passed to pd.DataFrame.to_csv(float_format=float_format). For example, use "%.3g" for 3 significant digits.

None
blocklist_file str

File location of a dataframe with features to exclude. Note that if "blocklist" is in operation, then the standard blocklist will be removed.

None
outlier_cutoff float

The threshold at which the maximum or minimum value of a feature across a full experiment is excluded. Note that this procedure is typically applied after normalization.

500
noise_removal_perturb_groups

Perturbation groups corresponding to rows in profiles or the name of the metadata column containing this information.

None
noise_removal_stdev_cutoff

Maximum mean feature standard deviation to be kept for noise removal, grouped by the identity of the perturbation from perturb_list. The data must already be normalized so that this cutoff can apply to all columns.

None

Returns:

Name Type Description
selected_df (DataFrame, optional)

The feature selected profile DataFrame. If output_file=None, then return the DataFrame. If you specify output_file, then write to file and do not return data.

Source code in pycytominer/feature_select.py
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
def feature_select(
    profiles,
    features="infer",
    image_features=False,
    samples="all",
    operation="variance_threshold",
    output_file=None,
    output_type="csv",
    na_cutoff=0.05,
    corr_threshold=0.9,
    corr_method="pearson",
    freq_cut=0.05,
    unique_cut=0.01,
    compression_options=None,
    float_format=None,
    blocklist_file=None,
    outlier_cutoff=500,
    noise_removal_perturb_groups=None,
    noise_removal_stdev_cutoff=None,
):
    """Perform feature selection based on the given operation.

    Each requested operation nominates features to exclude; the union of all
    nominated features is dropped from `profiles`. Operations run in the order
    given, and features excluded by an earlier operation are not passed to
    later ones.

    Parameters
    ----------
    profiles : pandas.core.frame.DataFrame or file
        DataFrame or file of profiles.
    features : list
        A list of strings corresponding to feature measurement column names in the
        `profiles` DataFrame. All features listed must be found in `profiles`.
        Defaults to "infer". If "infer", then assume cell painting features are those
        prefixed with "Cells", "Nuclei", or "Cytoplasm".
    image_features: bool, default False
        Whether the profiles contain image features.
    samples : list or str, default "all"
        Samples to provide operation on.
    operation: list of str or str, default "variance_threshold"
        Operations to perform on the input profiles. Supported operations are
        "variance_threshold", "correlation_threshold", "drop_na_columns",
        "blocklist", "drop_outliers", and "noise_removal".
    output_file : str, optional
        If provided, will write feature selected profiles to file. If not specified, will
        return the feature selected profiles as output. We recommend that this output file be
        suffixed with "_normalized_variable_selected.csv".
    output_type : str, optional
        If provided, will write feature selected profiles as a specified file type (either CSV or parquet).
        If not specified and output_file is provided, then the file will be output as CSV by default.
    na_cutoff : float, default 0.05
        Proportion of missing values in a column to tolerate before removing.
    corr_threshold : float, default 0.9
        Value between (0, 1) to exclude features above if any two features are correlated above this threshold.
    corr_method : str, default "pearson"
        Correlation type to compute. Allowed methods are "spearman", "kendall" and "pearson".
    freq_cut : float, default 0.05
        Ratio (2nd most common feature val / most common). Must range between 0 and 1.
        Remove features lower than freq_cut. A low freq_cut will remove features
        that have large difference between the most common feature and second most
        common feature. (e.g. this will remove a feature: [1, 1, 1, 1, 0.01, 0.01, ...])
    unique_cut: float, default 0.01
        Ratio (num unique features / num samples). Must range between 0 and 1.
        Remove features less than unique cut. A low unique_cut will remove features
        that have very few different measurements compared to the number of samples.
    compression_options : str or dict, optional
        Contains compression options as input to
        pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.
    float_format : str, optional
        Decimal precision to use in writing output file as input to
        pd.DataFrame.to_csv(float_format=float_format). For example, use "%.3g" for 3
        decimal precision.
    blocklist_file : str, optional
        File location of a dataframe with features to exclude. Only used when
        "blocklist" is in `operation`; if not provided, `get_blocklist_features`
        is called without a file.
    outlier_cutoff : float, default 500
        The threshold at which the maximum or minimum value of a feature across a full experiment is excluded. Note that this procedure is typically applied after normalization.
    noise_removal_perturb_groups: str or list of str, optional
        Perturbation groups corresponding to rows in profiles or the name of the metadata column containing this information.
    noise_removal_stdev_cutoff: float, optional
        Maximum mean feature standard deviation to be kept for noise removal, grouped by the identity of the perturbation from noise_removal_perturb_groups. The data must already be normalized so that this cutoff can apply to all columns.

    Returns
    -------
    selected_df : pandas.core.frame.DataFrame, optional
        The feature selected profile DataFrame. If output_file=None, then return the
        DataFrame. If you specify output_file, then write to file and do not return
        data.

    Raises
    ------
    ValueError
        If `operation` is neither a list nor a string, or if it names an
        operation that is not supported.

    """
    all_ops = [
        "variance_threshold",
        "correlation_threshold",
        "drop_na_columns",
        "blocklist",
        "drop_outliers",
        "noise_removal",
    ]

    # Make sure the user provides a supported operation
    if isinstance(operation, list):
        if not all(x in all_ops for x in operation):
            raise ValueError(
                f"Some operation(s) {operation} not supported. Choose {all_ops}"
            )
    elif isinstance(operation, str):
        if operation not in all_ops:
            raise ValueError(f"{operation} not supported. Choose {all_ops}")
        # Normalize a single operation name to a one-element list
        operation = [operation]
    else:
        # Raise (rather than return) so an invalid type cannot be mistaken for a result
        raise ValueError("Operation must be a list or string")

    # Load Data
    profiles = load_profiles(profiles)

    if features == "infer":
        features = infer_cp_features(profiles, image_features=image_features)

    excluded_features = []
    for op in operation:
        if op == "variance_threshold":
            exclude = variance_threshold(
                population_df=profiles,
                features=features,
                samples=samples,
                freq_cut=freq_cut,
                unique_cut=unique_cut,
            )
        elif op == "drop_na_columns":
            exclude = get_na_columns(
                population_df=profiles,
                features=features,
                samples=samples,
                cutoff=na_cutoff,
            )
        elif op == "correlation_threshold":
            exclude = correlation_threshold(
                population_df=profiles,
                features=features,
                samples=samples,
                threshold=corr_threshold,
                method=corr_method,
            )
        elif op == "blocklist":
            if blocklist_file:
                exclude = get_blocklist_features(
                    population_df=profiles, blocklist_file=blocklist_file
                )
            else:
                exclude = get_blocklist_features(population_df=profiles)
        elif op == "drop_outliers":
            exclude = drop_outlier_features(
                population_df=profiles,
                features=features,
                samples=samples,
                outlier_cutoff=outlier_cutoff,
            )
        elif op == "noise_removal":
            exclude = noise_removal(
                population_df=profiles,
                features=features,
                samples=samples,
                noise_removal_perturb_groups=noise_removal_perturb_groups,
                noise_removal_stdev_cutoff=noise_removal_stdev_cutoff,
            )
        excluded_features += exclude
        # Later operations only consider features that have not been excluded yet
        features = [feat for feat in features if feat not in excluded_features]

    # De-duplicate: several operations may nominate the same feature
    excluded_features = list(set(excluded_features))

    selected_df = profiles.drop(excluded_features, axis="columns")

    if output_file is not None:
        output(
            df=selected_df,
            output_filename=output_file,
            output_type=output_type,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return selected_df

pycytominer.normalize

Normalize observation features based on specified normalization method.

normalize(profiles, features='infer', image_features=False, meta_features='infer', samples='all', method='standardize', output_file=None, output_type='csv', compression_options=None, float_format=None, mad_robustize_epsilon=1e-18, spherize_center=True, spherize_method='ZCA-cor', spherize_epsilon=1e-06)

Normalize profiling features.

Parameters:

Name Type Description Default
profiles DataFrame or path

Either a pandas DataFrame or a file that stores profile data

required
features list

A list of strings corresponding to feature measurement column names in the profiles DataFrame. All features listed must be found in profiles. Defaults to "infer". If "infer", then assume cell painting features are those prefixed with "Cells", "Nuclei", or "Cytoplasm".

'infer'
image_features

Whether the profiles contain image features.

False
meta_features list

A list of strings corresponding to metadata column names in the profiles DataFrame. All features listed must be found in profiles. Defaults to "infer". If "infer", then assume metadata features are those prefixed with "Metadata"

'infer'
samples str

The metadata column values to use as a normalization reference. We often use control samples. The function uses a pd.query() function, so you should structure samples in this fashion. An example is "Metadata_treatment == 'control'" (include all quotes). Defaults to "all".

'all'
method str

How to normalize the dataframe. Defaults to "standardize". Check avail_methods for available normalization methods.

'standardize'
output_file str

If provided, will write normalized profiles to file. If not specified, will return the normalized profiles as output. We recommend that this output file be suffixed with "_normalized.csv".

None
output_type str

If provided, will write normalized profiles as a specified file type (either CSV or parquet). If not specified and output_file is provided, then the file will be outputed as CSV as default.

'csv'
compression_options str or dict

Contains compression options as input to pd.DataFrame.to_csv(compression=compression_options).

None
float_format str

Decimal precision to use in writing output file as input to pd.DataFrame.to_csv(float_format=float_format). For example, use "%.3g" for 3 decimal precision.

None
mad_robustize_epsilon

The mad_robustize fudge factor parameter. The function only uses this variable if method = "mad_robustize". Set this to 0 if mad_robustize generates features with large values.

1e-18
spherize_center bool

If the function should center data before sphering (aka whitening). The function only uses this variable if method = "spherize". Defaults to True.

True
spherize_method str

The sphering (aka whitening) normalization selection. The function only uses this variable if method = "spherize". Defaults to "ZCA-cor". See :py:func:pycytominer.operations.transform for available spherize methods.

'ZCA-cor'
spherize_epsilon float

The sphering (aka whitening) fudge factor parameter. The function only uses this variable if method = "spherize".

1e-6.

Returns:

Name Type Description
normalized (DataFrame, optional)

The normalized profile DataFrame. If output_file=None, then return the DataFrame. If you specify output_file, then write to file and do not return data.

Examples:

import pandas as pd
from pycytominer import normalize

data_df = pd.DataFrame({
    "Metadata_plate": ["a", "a", "a", "a", "b", "b", "b", "b"],
    "Metadata_treatment": [
        "drug",
        "drug",
        "control",
        "control",
        "drug",
        "drug",
        "control",
        "control",
    ],
    "x": [1, 2, 8, 2, 5, 5, 5, 1],
    "y": [3, 1, 7, 4, 5, 9, 6, 1],
    "z": [1, 8, 2, 5, 6, 22, 2, 2],
    "zz": [14, 46, 1, 6, 30, 100, 2, 2],
}).reset_index(drop=True)

normalized_df = normalize(
    profiles=data_df,
    features=["x", "y", "z", "zz"],
    meta_features="infer",
    samples="Metadata_treatment == 'control'",
    method="standardize",
)
Source code in pycytominer/normalize.py
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
def normalize(
    profiles,
    features="infer",
    image_features=False,
    meta_features="infer",
    samples="all",
    method="standardize",
    output_file=None,
    output_type="csv",
    compression_options=None,
    float_format=None,
    mad_robustize_epsilon=1e-18,
    spherize_center=True,
    spherize_method="ZCA-cor",
    spherize_epsilon=1e-6,
):
    """Normalize profiling features.

    A scaler is chosen by `method`, fit on the rows selected by `samples`
    (or on every row when `samples` is "all"), and then applied to all rows.
    The scaled feature columns are merged back with the metadata columns.

    Parameters
    ----------
    profiles : pandas.core.frame.DataFrame or path
        Either a pandas DataFrame or a file that stores profile data.
    features : list
        Feature measurement column names in the `profiles` DataFrame. Every
        listed feature must exist in `profiles`. Defaults to "infer", in which
        case cell painting features are assumed to be those prefixed with
        "Cells", "Nuclei", or "Cytoplasm".
    image_features: bool, default False
        Whether the profiles contain image features.
    meta_features : list
        Metadata column names in the `profiles` DataFrame. Every listed column
        must exist in `profiles`. Defaults to "infer", in which case metadata
        features are assumed to be those prefixed with "Metadata".
    samples : str
        Query selecting the rows used as the normalization reference (often
        control samples). The string is passed to `DataFrame.query`, e.g.
        "Metadata_treatment == 'control'" (include all quotes). Defaults to
        "all".
    method : str
        How to normalize the dataframe; matched case-insensitively. One of
        "standardize", "robustize", "mad_robustize", or "spherize". Defaults
        to "standardize".
    output_file : str, optional
        If provided, the normalized profiles are written to this file. If not
        specified, the normalized profiles are returned. We recommend that this
        output file be suffixed with "_normalized.csv".
    output_type : str, optional
        File type to write (either CSV or parquet). Defaults to CSV.
    compression_options : str or dict, optional
        Contains compression options as input to
        `pd.DataFrame.to_csv(compression=compression_options)`.
    float_format : str, optional
        Decimal precision to use in writing output file as input to
        pd.DataFrame.to_csv(float_format=float_format). For example, use
        "%.3g" for 3 decimal precision.
    mad_robustize_epsilon: float, optional
        The mad_robustize fudge factor parameter. Only used if
        method = "mad_robustize". Set this to 0 if mad_robustize generates
        features with large values.
    spherize_center : bool
        Whether to center data before sphering (aka whitening). Only used if
        method = "spherize". Defaults to True.
    spherize_method : str
        The sphering (aka whitening) normalization selection. Only used if
        method = "spherize". Defaults to "ZCA-cor". See
        :py:func:`pycytominer.operations.transform` for available spherize
        methods.
    spherize_epsilon : float, default 1e-6
        The sphering (aka whitening) fudge factor parameter. Only used if
        method = "spherize".

    Returns
    -------
    normalized : pandas.core.frame.DataFrame, optional
        The normalized profile DataFrame. Returned only when output_file is
        None; otherwise the result is written to file and nothing is returned.

    Raises
    ------
    ValueError
        If `method` is not an available method, or if the shape of the
        normalized output does not match the input.

    Examples
    --------
    ```python
    import pandas as pd
    from pycytominer import normalize

    data_df = pd.DataFrame({
        "Metadata_plate": ["a", "a", "a", "a", "b", "b", "b", "b"],
        "Metadata_treatment": [
            "drug",
            "drug",
            "control",
            "control",
            "drug",
            "drug",
            "control",
            "control",
        ],
        "x": [1, 2, 8, 2, 5, 5, 5, 1],
        "y": [3, 1, 7, 4, 5, 9, 6, 1],
        "z": [1, 8, 2, 5, 6, 22, 2, 2],
        "zz": [14, 46, 1, 6, 30, 100, 2, 2],
    }).reset_index(drop=True)

    normalized_df = normalize(
        profiles=data_df,
        features=["x", "y", "z", "zz"],
        meta_features="infer",
        samples="Metadata_treatment == 'control'",
        method="standardize",
    )
    ```
    """
    # Accept either an in-memory DataFrame or a path
    profiles = load_profiles(profiles)

    # Method names are matched case-insensitively
    method = method.lower()
    avail_methods = ["standardize", "robustize", "mad_robustize", "spherize"]
    if method not in avail_methods:
        raise ValueError(f"operation must be one {avail_methods}")

    # Pick the scaler; after validation the final branch can only be "standardize"
    is_spherize = method == "spherize"
    if is_spherize:
        scaler = Spherize(
            center=spherize_center,
            method=spherize_method,
            epsilon=spherize_epsilon,
            return_numpy=True,
        )
    elif method == "mad_robustize":
        scaler = RobustMAD(epsilon=mad_robustize_epsilon)
    elif method == "robustize":
        scaler = RobustScaler()
    else:
        scaler = StandardScaler()

    # Split the profiles into feature and metadata views
    if features == "infer":
        features = infer_cp_features(profiles, image_features=image_features)
    raw_feature_df = profiles.loc[:, features]

    if meta_features == "infer":
        meta_features = infer_cp_features(profiles, metadata=True)
    meta_df = profiles.loc[:, meta_features]

    # Fit on the reference rows only, then transform every row
    if samples == "all":
        reference_df = raw_feature_df
    else:
        reference_df = profiles.query(samples).loc[:, features]
    fitted_scaler = scaler.fit(reference_df)
    scaled_values = fitted_scaler.transform(raw_feature_df)

    # The spherize scaler exposes its own column labels
    if is_spherize:
        scaled_columns = fitted_scaler.columns
    else:
        scaled_columns = raw_feature_df.columns

    feature_df = pd.DataFrame(
        scaled_values,
        columns=scaled_columns,
        index=raw_feature_df.index,
    )

    normalized = meta_df.merge(feature_df, left_index=True, right_index=True)

    # Sanity checks: scaling must change neither the row nor the column count
    if feature_df.shape != raw_feature_df.shape:
        error_detail = "The number of rows and columns in the feature dataframe does not match the original dataframe"
        context = f"the `{method}` method in `pycytominer.normalize`"
        raise ValueError(f"{error_detail}. This is likely a bug in {context}")

    expected_rows = profiles.shape[0]
    expected_cols = len(features) + len(meta_features)
    if not (
        normalized.shape[0] == expected_rows and normalized.shape[1] == expected_cols
    ):
        error_detail = "The number of rows and columns in the normalized dataframe does not match the original dataframe"
        context = f"the `{method}` method in `pycytominer.normalize`"
        raise ValueError(f"{error_detail}. This is likely a bug in {context}.")

    if output_file is None:
        return normalized

    output(
        df=normalized,
        output_filename=output_file,
        output_type=output_type,
        compression_options=compression_options,
        float_format=float_format,
    )