Skip to content

Cyto utilities

Functions enabling smooth interaction with CellProfiler and DeepProfiler output formats.

A variety of utility functions for working with cytominer data.

pycytominer.cyto_utils.DeepProfiler_processing

Utility function to load and process the output files of a DeepProfiler run.

AggregateDeepProfiler

Class that holds all functions needed to aggregate the DeepProfiler (DP) run.

Attributes:

Name Type Description
deep_data DeepProfilerData

DeepProfilerData object to load data from DeepProfiler project

aggregated_profiles DataFrame

df to hold the metadata and profiles.

file_aggregate dict

dict that holds the file names and metadata. Is used to load in the npz files in the correct order and grouping.

output_file str

If provided, will write annotated profiles to folder. Defaults to None.

Methods:

Name Description
aggregate_deep

Given an initialized AggregateDeepProfiler() class, run this function to output level 3 profiles (aggregated profiles with annotated metadata).

Example

import pathlib from pycytominer.cyto_utils import DeepProfiler_processing

index_file = pathlib.Path("path/to/index.csv") profile_dir = pathlib.Path("path/to/features/")

deep_data = DeepProfiler_processing.DeepProfilerData(index_file, profile_dir, filename_delimiter="/", file_extension=".npz") deep_aggregate = DeepProfiler_processing.AggregateDeepProfiler(deep_data) deep_profiles = deep_aggregate.aggregate_deep()

Source code in pycytominer/cyto_utils/DeepProfiler_processing.py
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
class AggregateDeepProfiler:
    """Class that holds all functions needed to aggregate the DeepProfiler (DP) run.

    Attributes
    ----------
    deep_data : DeepProfilerData
        DeepProfilerData object to load data from DeepProfiler project
    aggregated_profiles : pandas.DataFrame
        df to hold the metadata and profiles.
    file_aggregate : dict
        dict that holds the file names and metadata.
        Is used to load in the npz files in the correct order and grouping.
    output_file : str
        If provided, will write annotated profiles to folder. Defaults to None.

    Methods
    -------
    aggregate_deep()
        Given an initialized AggregateDeepProfiler() class, run this function to output
        level 3 profiles (aggregated profiles with annotated metadata).

    Example
    -------
    import pathlib
    from pycytominer.cyto_utils import DeepProfiler_processing

    index_file = pathlib.Path("path/to/index.csv")
    profile_dir = pathlib.Path("path/to/features/")

    deep_data = DeepProfiler_processing.DeepProfilerData(index_file, profile_dir, filename_delimiter="/", file_extension=".npz")
    deep_aggregate = DeepProfiler_processing.AggregateDeepProfiler(deep_data)
    deep_profiles = deep_aggregate.aggregate_deep()
    """

    def __init__(
        self,
        deep_data: DeepProfilerData,
        aggregate_operation="median",
        aggregate_on="well",
        output_file=None,
    ):
        """
        __init__ function for this class.

        Arguments
        ---------
        deep_data : DeepProfilerData
            object that locates and loads the DeepProfiler output files
        aggregate_operation : str, default "median"
            statistic used to collapse the single-cell rows ("median" or "mean")
        aggregate_on : str, default "well"
            level at which profiles are grouped ("site", "well" or "plate")
        output_file : str, optional
            folder to write one csv per grouping; nothing is written when None
        """
        # NOTE(review): assert statements are stripped under `python -O`;
        # an explicit `raise ValueError` would survive optimized runs.
        assert aggregate_operation in [  # noqa: S101
            "median",
            "mean",
        ], "Input of aggregate_operation is incorrect, it must be either median or mean"
        assert aggregate_on in [  # noqa: S101
            "site",
            "well",
            "plate",
        ], "Input of aggregate_on is incorrect, it must be either site or well or plate"

        self.deep_data = deep_data
        self.aggregate_operation = aggregate_operation
        self.aggregate_on = aggregate_on
        self.output_file = output_file

    def setup_aggregate(self):
        """Set up the file_aggregate attribute.

        A helper function to `aggregate_deep` that populates the file_aggregate
        dictionary, which contains the file locations and metadata for each grouping.
        If for example we are grouping by well then the keys of self.file_aggregate would be:
        plate1/well1, plate1/well2, plate2/well1, etc.
        """
        # lazily build the npz file list on first use
        if not hasattr(self.deep_data, "filenames"):
            self.deep_data.build_filenames()

        self.file_aggregate = {}
        for filename in self.deep_data.filenames:
            file_info = self.deep_data.extract_filename_metadata(
                filename, self.deep_data.filename_delimiter
            )
            # default key (used when aggregating per plate)
            file_key = file_info[self.aggregate_on]

            if self.aggregate_on == "site":
                file_key = (
                    f"{file_info['plate']}/{file_info['well']}_{file_info['site']}"
                )

            if self.aggregate_on == "well":
                file_key = f"{file_info['plate']}/{file_info['well']}"

            if file_key in self.file_aggregate:
                self.file_aggregate[file_key]["files"].append(filename)
            else:
                self.file_aggregate[file_key] = {}
                self.file_aggregate[file_key]["files"] = [filename]

            # metadata of the last file seen wins for the grouping
            self.file_aggregate[file_key]["metadata"] = file_info

    def aggregate_deep(self):
        """
        Aggregate the DeepProfiler profiles into a pandas dataframe.

        For each key in file_aggregate, the profiles are loaded, concatenated and then aggregated.
        If files are missing, we throw a warning but continue the code.
        After aggregation, the metadata is concatenated back onto the dataframe.

        Returns
        -------
        df_out : pandas.dataframe
            dataframe with all metadata and the feature space.
            This is the input to any further pycytominer or pycytominer-eval processing
        """
        if not hasattr(self, "file_aggregate"):
            self.setup_aggregate()

        self.aggregated_profiles = []
        # e.g. "Metadata_Well_Position"
        self.aggregate_merge_col = f"Metadata_{self.aggregate_on.capitalize()}_Position"

        # Iterates over all sites, wells or plates
        for metadata_level in self.file_aggregate:
            # uses custom load function to create df with metadata and profiles
            arr = [
                load_npz_features(x)
                for x in self.file_aggregate[metadata_level]["files"]
            ]
            # empty dataframes from missing files are deleted
            arr = [x for x in arr if not x.empty]
            # if no files were found there is a miss-match between the index and the output files
            if not len(arr):
                warnings.warn(
                    f"No files for the key {metadata_level} could be found.\nThis program will continue, but be aware that this might induce errors!"
                )
                continue
            df = pd.concat(arr)

            # extract metadata prior to aggregation
            meta_df = pd.DataFrame()
            metadata_cols = infer_cp_features(df, metadata=True)
            profiles = [x for x in df.columns.tolist() if x not in metadata_cols]

            # If all rows have the same Metadata information, that value is valid for the aggregated df
            for col in metadata_cols:
                if len(df[col].unique()) == 1:
                    meta_df[col] = [df[col].unique()[0]]

            # perform the aggregation
            # the strata column is constant, so the whole grouping collapses to one row
            df = df.assign(Metadata_Aggregate_On=self.aggregate_on)
            df = aggregate.aggregate(
                population_df=df,
                strata="Metadata_Aggregate_On",
                features=profiles,
                operation=self.aggregate_operation,
            ).reset_index(drop=True)

            # add the aggregation level as a column
            df.loc[:, self.aggregate_merge_col] = metadata_level
            # concatenate the metadata back onto the aggregated profile
            df = pd.concat([df, meta_df], axis=1)

            # save metalevel file
            if self.output_file is not None:
                if not os.path.exists(self.output_file):
                    # NOTE(review): os.mkdir creates one level only; os.makedirs
                    # would be needed for a nested output path — confirm
                    os.mkdir(self.output_file)
                file_path = os.path.join(
                    self.output_file, metadata_level.replace("/", "_")
                )
                df.to_csv(f"{file_path}.csv", index=False)
            self.aggregated_profiles.append(df)

        # Concatenate all of the above created profiles
        # NOTE(review): raises ValueError if every grouping was skipped above — confirm intended
        self.aggregated_profiles = pd.concat(
            list(self.aggregated_profiles)
        ).reset_index(drop=True)

        # clean and reindex columns
        self.aggregated_profiles.columns = [
            str(x) for x in self.aggregated_profiles.columns
        ]
        meta_features = infer_cp_features(self.aggregated_profiles, metadata=True)
        # NOTE(review): `profiles` carries over from the *last* loop iteration;
        # assumes every grouping shares the same feature columns — confirm
        reindex_profiles = [str(x) for x in profiles]
        self.aggregated_profiles = self.aggregated_profiles.reindex(
            meta_features + reindex_profiles, axis="columns"
        )

        # If Columns have NaN values from concatenation, drop these
        # (any column containing at least one NaN is removed entirely)
        self.aggregated_profiles.dropna(axis="columns", inplace=True)

        df_out = self.aggregated_profiles
        return df_out

__init__(deep_data, aggregate_operation='median', aggregate_on='well', output_file=None)

init function for this class.

Arguments

See above for all parameters.

Source code in pycytominer/cyto_utils/DeepProfiler_processing.py
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
def __init__(
    self,
    deep_data: DeepProfilerData,
    aggregate_operation="median",
    aggregate_on="well",
    output_file=None,
):
    """
    __init__ function for this class.

    Validates the requested aggregation settings and stores all
    configuration on the instance.

    Arguments
    ---------
    See above for all parameters.
    """
    valid_operations = ("median", "mean")
    assert aggregate_operation in valid_operations, (  # noqa: S101
        "Input of aggregate_operation is incorrect, it must be either median or mean"
    )
    valid_levels = ("site", "well", "plate")
    assert aggregate_on in valid_levels, (  # noqa: S101
        "Input of aggregate_on is incorrect, it must be either site or well or plate"
    )

    self.deep_data = deep_data
    self.aggregate_operation = aggregate_operation
    self.aggregate_on = aggregate_on
    self.output_file = output_file

aggregate_deep()

Aggregate the DeepProfiler profiles into a pandas dataframe.

For each key in file_aggregate, the profiles are loaded, concatenated and then aggregated. If files are missing, we throw a warning but continue the code. After aggregation, the metadata is concatenated back onto the dataframe.

Returns:

Name Type Description
df_out dataframe

dataframe with all metadata and the feature space. This is the input to any further pycytominer or pycytominer-eval processing

Source code in pycytominer/cyto_utils/DeepProfiler_processing.py
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
def aggregate_deep(self):
    """
    Aggregate the DeepProfiler profiles into a pandas dataframe.

    For each key in file_aggregate, the profiles are loaded, concatenated and then aggregated.
    If files are missing, we throw a warning but continue the code.
    After aggregation, the metadata is concatenated back onto the dataframe.

    Returns
    -------
    df_out : pandas.dataframe
        dataframe with all metadata and the feature space.
        This is the input to any further pycytominer or pycytominer-eval processing
    """
    if not hasattr(self, "file_aggregate"):
        self.setup_aggregate()

    self.aggregated_profiles = []
    # e.g. "Metadata_Well_Position"
    self.aggregate_merge_col = f"Metadata_{self.aggregate_on.capitalize()}_Position"

    # Iterates over all sites, wells or plates
    for metadata_level in self.file_aggregate:
        # uses custom load function to create df with metadata and profiles
        arr = [
            load_npz_features(x)
            for x in self.file_aggregate[metadata_level]["files"]
        ]
        # empty dataframes from missing files are deleted
        arr = [x for x in arr if not x.empty]
        # if no files were found there is a miss-match between the index and the output files
        if not len(arr):
            warnings.warn(
                f"No files for the key {metadata_level} could be found.\nThis program will continue, but be aware that this might induce errors!"
            )
            continue
        df = pd.concat(arr)

        # extract metadata prior to aggregation
        meta_df = pd.DataFrame()
        metadata_cols = infer_cp_features(df, metadata=True)
        profiles = [x for x in df.columns.tolist() if x not in metadata_cols]

        # If all rows have the same Metadata information, that value is valid for the aggregated df
        for col in metadata_cols:
            if len(df[col].unique()) == 1:
                meta_df[col] = [df[col].unique()[0]]

        # perform the aggregation
        # the strata column is constant, so the whole grouping collapses to one row
        df = df.assign(Metadata_Aggregate_On=self.aggregate_on)
        df = aggregate.aggregate(
            population_df=df,
            strata="Metadata_Aggregate_On",
            features=profiles,
            operation=self.aggregate_operation,
        ).reset_index(drop=True)

        # add the aggregation level as a column
        df.loc[:, self.aggregate_merge_col] = metadata_level
        # concatenate the metadata back onto the aggregated profile
        df = pd.concat([df, meta_df], axis=1)

        # save metalevel file
        if self.output_file is not None:
            if not os.path.exists(self.output_file):
                # NOTE(review): os.mkdir creates one level only; os.makedirs
                # would be needed for a nested output path — confirm
                os.mkdir(self.output_file)
            file_path = os.path.join(
                self.output_file, metadata_level.replace("/", "_")
            )
            df.to_csv(f"{file_path}.csv", index=False)
        self.aggregated_profiles.append(df)

    # Concatenate all of the above created profiles
    # NOTE(review): raises ValueError if every grouping was skipped above — confirm intended
    self.aggregated_profiles = pd.concat(
        list(self.aggregated_profiles)
    ).reset_index(drop=True)

    # clean and reindex columns
    self.aggregated_profiles.columns = [
        str(x) for x in self.aggregated_profiles.columns
    ]
    meta_features = infer_cp_features(self.aggregated_profiles, metadata=True)
    # NOTE(review): `profiles` carries over from the *last* loop iteration;
    # assumes every grouping shares the same feature columns — confirm
    reindex_profiles = [str(x) for x in profiles]
    self.aggregated_profiles = self.aggregated_profiles.reindex(
        meta_features + reindex_profiles, axis="columns"
    )

    # If Columns have NaN values from concatenation, drop these
    # (any column containing at least one NaN is removed entirely)
    self.aggregated_profiles.dropna(axis="columns", inplace=True)

    df_out = self.aggregated_profiles
    return df_out

setup_aggregate()

Set up the file_aggregate attribute.

A helper function to aggregate_deep that populates the file_aggregate dictionary, which contains the file locations and metadata for each grouping. If for example we are grouping by well then the keys of self.file_aggregate would be: plate1/well1, plate1/well2, plate2/well1, etc.

Source code in pycytominer/cyto_utils/DeepProfiler_processing.py
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
def setup_aggregate(self):
    """Set up the file_aggregate attribute.

    A helper function to `aggregate_deep` that populates the file_aggregate
    dictionary, mapping one key per grouping to the npz file locations and
    the filename metadata of that grouping. When grouping by well, the keys
    of self.file_aggregate look like: plate1/well1, plate1/well2, plate2/well1, etc.
    """
    # lazily build the npz file list on first use
    if not hasattr(self.deep_data, "filenames"):
        self.deep_data.build_filenames()

    self.file_aggregate = {}
    for filename in self.deep_data.filenames:
        file_info = self.deep_data.extract_filename_metadata(
            filename, self.deep_data.filename_delimiter
        )

        # derive the grouping key for this file
        if self.aggregate_on == "site":
            file_key = (
                f"{file_info['plate']}/{file_info['well']}_{file_info['site']}"
            )
        elif self.aggregate_on == "well":
            file_key = f"{file_info['plate']}/{file_info['well']}"
        else:
            # per-plate aggregation uses the plate name directly
            file_key = file_info[self.aggregate_on]

        # collect files under their grouping key; the metadata of the
        # most recent file wins (files in a group share plate/well anyway)
        entry = self.file_aggregate.setdefault(file_key, {"files": []})
        entry["files"].append(filename)
        entry["metadata"] = file_info

DeepProfilerData

Class that holds all functions needed to load and annotate the DeepProfiler (DP) run.

Attributes:

Name Type Description
profile_dir str

file location of the output profiles from DeepProfiler (e.g. /project1/outputs/results/features/)

filename_delimiter default = '_'

delimiter for the filenames of the profiles (e.g. B02_4.npz).

file_extension default = '.npz'

extension of the profile file.

index_df DataFrame

load in the index.csv file from DeepProfiler, provided by an input index file.

filenames list of paths

list of Purepaths that point to the npz files.

Methods:

Name Description
build_filenames

build filenames from index_df

extract_filename_metadata

get site, well, plate info for npz file

Source code in pycytominer/cyto_utils/DeepProfiler_processing.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
class DeepProfilerData:
    """Class that holds all functions needed to load and annotate the DeepProfiler (DP) run.

    Attributes
    ----------
    profile_dir : str
        file location of the output profiles from DeepProfiler
        (e.g. `/project1/outputs/results/features/`)
    filename_delimiter : default = '_'
        delimiter for the filenames of the profiles (e.g. B02_4.npz).
    file_extension : default = '.npz'
        extension of the profile file.
    index_df : pandas.DataFrame
        load in the index.csv file from DeepProfiler, provided by an input index file.
    filenames : list of paths
        list of Purepaths that point to the npz files.

    Methods
    -------
    build_filenames()
        build filenames from index_df
    extract_filename_metadata(npz_file, delimiter="_")
        get site, well, plate info for npz file
    """

    def __init__(
        self,
        index_file,
        profile_dir,
        filename_delimiter="_",
        file_extension=".npz",
    ):
        """
        __init__ function for this class.

        Arguments
        ---------
        index_file : str
            file location of the index.csv from DP

        See above for all other parameters.
        """
        # read identifiers as strings so numeric-looking wells/plates
        # (e.g. "0012") keep their leading zeros
        self.index_df = pd.read_csv(index_file, dtype=str)
        self.profile_dir = profile_dir
        self.filename_delimiter = filename_delimiter
        # normalize so both "npz" and ".npz" are accepted
        self.file_extension = file_extension
        if not self.file_extension.startswith("."):
            self.file_extension = f".{self.file_extension}"

    def build_filenames(self):
        """Create file names indicated by plate, well, and site information."""
        self.filenames = self.index_df.apply(
            self.build_filename_from_index, axis="columns"
        )
        self.filenames = [
            pathlib.PurePath(f"{self.profile_dir}/{x}") for x in self.filenames
        ]

    def build_filename_from_index(self, row):
        """Build the name of the profile files."""
        plate = row["Metadata_Plate"]
        well = row["Metadata_Well"]
        site = row["Metadata_Site"]

        filename = f"{plate}/{well}{self.filename_delimiter}{site}{self.file_extension}"
        return filename

    def extract_filename_metadata(self, npz_file, delimiter="_"):
        """Extract metadata (site, well and plate) from the filename.

        This function is used to extract the metadata from the filename of the npz files.
        It expects a naming convention of path/plate/well{delimiter}site.npz
        (i.e. path/plate/well/site.npz when the delimiter is "/").

        Arguments
        ---------
        npz_file : str
            file path (assumed to use "/" as the path separator)

        delimiter : str
            the delimiter used in the naming convention of the files. default = '_'

        Returns
        -------
        loc : dict
            dict with metadata
        """
        path_parts = str(npz_file).split("/")
        # Remove only a trailing ".npz". The previous `str.strip(".npz")`
        # removed any of the characters '.', 'n', 'p', 'z' from BOTH ends,
        # corrupting names such as "zB02" or sites ending in those letters.
        stem = path_parts[-1]
        if stem.endswith(".npz"):
            stem = stem[: -len(".npz")]

        if delimiter == "/":
            # layout: .../plate/well/site.npz
            site = stem
            well = path_parts[-2]
            # plate is one level above the well directory; the previous
            # code took [-2], which duplicated the well name
            plate = path_parts[-3]
        else:
            # layout: .../plate/well{delimiter}site.npz
            base_file = stem.split(delimiter)
            site = base_file[-1]
            well = base_file[-2]
            plate = path_parts[-2]

        loc = {"site": site, "well": well, "plate": plate}
        return loc

__init__(index_file, profile_dir, filename_delimiter='_', file_extension='.npz')

init function for this class.

Arguments

index_file : str file location of the index.csv from DP

See above for all other parameters.

Source code in pycytominer/cyto_utils/DeepProfiler_processing.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def __init__(
    self,
    index_file,
    profile_dir,
    filename_delimiter="_",
    file_extension=".npz",
):
    """
    __init__ function for this class.

    Loads the DeepProfiler index file and records the profile-file naming
    configuration.

    Arguments
    ---------
    index_file : str
        file location of the index.csv from DP

    See above for all other parameters.
    """
    # read identifiers as strings so numeric-looking wells/plates keep leading zeros
    self.index_df = pd.read_csv(index_file, dtype=str)
    self.profile_dir = profile_dir
    self.filename_delimiter = filename_delimiter
    # accept the extension with or without a leading dot
    self.file_extension = (
        file_extension if file_extension.startswith(".") else f".{file_extension}"
    )

build_filename_from_index(row)

Build the name of the profile files.

Source code in pycytominer/cyto_utils/DeepProfiler_processing.py
75
76
77
78
79
80
81
82
def build_filename_from_index(self, row):
    """Build the name of the profile files.

    Combines the row's plate, well, and site identifiers with the configured
    delimiter and file extension into a relative path: plate/well{delim}site{ext}.
    """
    return (
        f"{row['Metadata_Plate']}/"
        f"{row['Metadata_Well']}{self.filename_delimiter}"
        f"{row['Metadata_Site']}{self.file_extension}"
    )

build_filenames()

Create file names indicated by plate, well, and site information.

Source code in pycytominer/cyto_utils/DeepProfiler_processing.py
66
67
68
69
70
71
72
73
def build_filenames(self):
    """Create file names indicated by plate, well, and site information."""
    # one relative profile path per row of the index file
    relative_paths = self.index_df.apply(
        self.build_filename_from_index, axis="columns"
    )
    # anchor each relative path under the profile directory
    self.filenames = [
        pathlib.PurePath(f"{self.profile_dir}/{relative}")
        for relative in relative_paths
    ]

extract_filename_metadata(npz_file, delimiter='_')

Extract metadata (site, well and plate) from the filename.

This function is used to extract the metadata from the filename of the npz files. It expects a naming convention of path/plate/well{delimiter}site.npz.

Arguments

npz_file : str file path

delimiter : str the delimiter used in the naming convention of the files. default = '_'

Returns:

Name Type Description
loc dict

dict with metadata

Source code in pycytominer/cyto_utils/DeepProfiler_processing.py
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
def extract_filename_metadata(self, npz_file, delimiter="_"):
    """Extract metadata (site, well and plate) from the filename.

    This function is used to extract the metadata from the filename of the npz files.
    It expects a naming convention of path/plate/well{delimiter}site.npz
    (i.e. path/plate/well/site.npz when the delimiter is "/").

    Arguments
    ---------
    npz_file : str
        file path (assumed to use "/" as the path separator)

    delimiter : str
        the delimiter used in the naming convention of the files. default = '_'

    Returns
    -------
    loc : dict
        dict with metadata
    """
    path_parts = str(npz_file).split("/")
    # Remove only a trailing ".npz". The previous `str.strip(".npz")`
    # removed any of the characters '.', 'n', 'p', 'z' from BOTH ends,
    # corrupting names such as "zB02" or sites ending in those letters.
    stem = path_parts[-1]
    if stem.endswith(".npz"):
        stem = stem[: -len(".npz")]

    if delimiter == "/":
        # layout: .../plate/well/site.npz
        site = stem
        well = path_parts[-2]
        # plate is one level above the well directory; the previous
        # code took [-2], which duplicated the well name
        plate = path_parts[-3]
    else:
        # layout: .../plate/well{delimiter}site.npz
        base_file = stem.split(delimiter)
        site = base_file[-1]
        well = base_file[-2]
        plate = path_parts[-2]

    loc = {"site": site, "well": well, "plate": plate}
    return loc

SingleCellDeepProfiler

Class that holds functions needed to analyze single cells from the DeepProfiler (DP) run.

Only pycytominer.normalization() is implemented.

Attributes:

Name Type Description
deep_data DeepProfilerData

DeepProfilerData object to load data from DeepProfiler project

aggregated_profiles DataFrame

df to hold the metadata and profiles.

file_aggregate dict

dict that holds the file names and metadata. Is used to load in the npz files in the correct order and grouping.

output_file str

If provided, will write annotated profiles to folder. Defaults to None.

Methods:

Name Description
normalize
float_format, mad_robustize_epsilon, spherize_center, spherize_method, spherize_epsilon)

normalize profiling features from DeepProfiler run with pycytominer.normalize()

Example

import pathlib from pycytominer.cyto_utils import DeepProfiler_processing

index_file = pathlib.Path("path/to/index.csv") profile_dir = pathlib.Path("path/to/features/")

deep_data = DeepProfiler_processing.DeepProfilerData(index_file, profile_dir, filename_delimiter="/", file_extension=".npz") deep_single_cell = DeepProfiler_processing.SingleCellDeepProfiler(deep_data) normalized = deep_single_cell.normalize_deep_single_cells()

Source code in pycytominer/cyto_utils/DeepProfiler_processing.py
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
class SingleCellDeepProfiler:
    """Class that holds functions needed to analyze single cells from the DeepProfiler (DP) run.

    Only pycytominer.normalization() is implemented.

    Attributes
    ----------
    deep_data : DeepProfilerData
        DeepProfilerData object to load data from DeepProfiler project
    aggregated_profiles : pandas.DataFrame
        df to hold the metadata and profiles.
    file_aggregate : dict
        dict that holds the file names and metadata.
        Is used to load in the npz files in the correct order and grouping.
    output_file : str
        If provided, will write annotated profiles to folder. Defaults to None.

    Methods
    -------
    normalize(profiles, features, image_features, meta_features, samples, method, output_file, compression_options,
    float_format, mad_robustize_epsilon, spherize_center, spherize_method, spherize_epsilon)
        normalize profiling features from DeepProfiler run with pycytominer.normalize()

    Example
    -------
    import pathlib
    from pycytominer.cyto_utils import DeepProfiler_processing

    index_file = pathlib.Path("path/to/index.csv")
    profile_dir = pathlib.Path("path/to/features/")

    deep_data = DeepProfiler_processing.DeepProfilerData(index_file, profile_dir, filename_delimiter="/", file_extension=".npz")
    deep_single_cell = DeepProfiler_processing.SingleCellDeepProfiler(deep_data)
    normalized = deep_single_cell.normalize_deep_single_cells()
    """

    def __init__(
        self,
        deep_data: DeepProfilerData,
    ):
        """
        __init__ function for this class.

        Arguments
        ---------
        deep_data : DeepProfilerData
            object that locates and loads the DeepProfiler output files
        """
        self.deep_data = deep_data

    def get_single_cells(
        self, output=False, location_x_col_index=0, location_y_col_index=1
    ):
        """Set up a single_cells dataframe in the format expected by pycytominer.normalize().

        Helper function to normalize_deep_single_cells() that sets up the single_cells attribute or outputs it as a dataframe.

        Arguments
        -----------
        output : bool
            If true, will output the single cell dataframe instead of setting to self attribute
        location_x_col_index: int
            index of the x location column (which column in DP output has X coords)
        location_y_col_index: int
            index of the y location column (which column in DP output has Y coords)
        """
        # build filenames if they do not already exist
        if not hasattr(self.deep_data, "filenames"):
            self.deep_data.build_filenames()

        # compile features dataframe with single cell locations
        total_df = []
        for features_path in self.deep_data.filenames:
            features = load_npz_features(features_path)
            # skip a file if there are no features
            if len(features.index) == 0:
                warnings.warn(
                    f"No features could be found at {features_path}.\nThis program will continue, but be aware that this might induce errors!"
                )
                continue
            locations = load_npz_locations(
                features_path, location_x_col_index, location_y_col_index
            )
            # column-wise concat; assumes locations and features share the same
            # per-cell row order within the npz file — TODO confirm
            detailed_df = pd.concat([locations, features], axis=1)

            total_df.append(detailed_df)

        # NOTE(review): raises ValueError if every file was skipped above — confirm intended
        sc_df = pd.concat(total_df).reset_index(drop=True)
        if output:
            return sc_df
        else:
            # implicitly returns None; result is stored on self.single_cells
            self.single_cells = sc_df

    def normalize_deep_single_cells(
        self,
        location_x_col_index=0,
        location_y_col_index=1,
        image_features=False,  # not implemented with DeepProfiler
        meta_features="infer",
        samples="all",
        method="standardize",
        output_file=None,
        compression_options=None,
        float_format=None,
        mad_robustize_epsilon=1e-18,
        spherize_center=True,
        spherize_method="ZCA-cor",
        spherize_epsilon=1e-6,
    ):
        """
        Normalize all cells into a pandas dataframe.

        For each file in the DP project features folder, the features from each cell are loaded.
        These features are put into a profiles dataframe for use in pycytominer.normalize.
        A features list is also compiled for use in pycytominer.normalize.

        Arguments
        ---------
        location_x_col_index : int
            index of the column in the DP output that holds the X coordinates
        location_y_col_index : int
            index of the column in the DP output that holds the Y coordinates
        image_features : bool
            not implemented with DeepProfiler; forwarded to pycytominer.normalize()

        All remaining parameters are forwarded to pycytominer.normalize();
        output_file is handled in this method instead (see below).

        Returns
        -------
        df_out : pandas.dataframe
            dataframe with all metadata and the feature space.
            This is the input to any further pycytominer or pycytominer-eval processing
        """
        # NOTE(review): consider the logging module instead of print for library code
        print("getting single cells")
        # setup single_cells attribute
        if not hasattr(self, "single_cells"):
            self.get_single_cells(
                output=False,
                location_x_col_index=location_x_col_index,
                location_y_col_index=location_y_col_index,
            )

        # extract metadata prior to normalization
        metadata_cols = infer_cp_features(self.single_cells, metadata=True)
        # locations are not automatically inferred with cp features
        metadata_cols.append("Location_Center_X")
        metadata_cols.append("Location_Center_Y")
        derived_features = [
            x for x in self.single_cells.columns.tolist() if x not in metadata_cols
        ]

        # wrapper for pycytominer.normalize() function
        # output_file=None on purpose: writing happens below, after the
        # location columns are re-inserted
        normalized = normalize.normalize(
            profiles=self.single_cells,
            features=derived_features,
            image_features=image_features,
            meta_features=meta_features,
            samples=samples,
            method=method,
            output_file=None,
            compression_options=compression_options,
            float_format=float_format,
            mad_robustize_epsilon=mad_robustize_epsilon,
            spherize_center=spherize_center,
            spherize_method=spherize_method,
            spherize_epsilon=spherize_epsilon,
        )

        # move x locations and y locations to metadata columns of normalized df
        # assumes normalize() preserved the row order of self.single_cells — TODO confirm
        x_locations = self.single_cells["Location_Center_X"]
        normalized.insert(0, "Location_Center_X", x_locations)
        y_locations = self.single_cells["Location_Center_Y"]
        normalized.insert(1, "Location_Center_Y", y_locations)

        # separate code because normalize() will not return if it has an output file specified
        if output_file is not None:
            # presumably the pycytominer.cyto_utils.output helper — verify import
            output(
                df=normalized,
                output_filename=output_file,
                compression_options=compression_options,
                float_format=float_format,
            )

        return normalized

__init__(deep_data)

init function for this class.

Arguments

See above for all parameters.

Source code in pycytominer/cyto_utils/DeepProfiler_processing.py
341
342
343
344
345
346
347
348
349
350
351
352
def __init__(
    self,
    deep_data: DeepProfilerData,
):
    """
    __init__ function for this class.

    Arguments
    ---------
    deep_data : DeepProfilerData
        DeepProfilerData object pointing at the index file and feature folder
        of a DeepProfiler run; all processing methods read their inputs from it.
    """
    self.deep_data = deep_data

get_single_cells(output=False, location_x_col_index=0, location_y_col_index=1)

Set up a single_cells dataframe in the format expected by pycytominer.normalize().

Helper function to normalize_deep_single_cells() that sets up the single_cells attribute or outputs it as a dataframe.

Arguments

output : bool If true, will output the single cell dataframe instead of setting to self attribute location_x_col_index: int index of the x location column (which column in DP output has X coords) location_y_col_index: int index of the y location column (which column in DP output has Y coords)

Source code in pycytominer/cyto_utils/DeepProfiler_processing.py
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
def get_single_cells(
    self, output=False, location_x_col_index=0, location_y_col_index=1
):
    """Set up a single_cells dataframe in the format expected by pycytominer.normalize().

    Helper function to normalize_deep_single_cells() that either sets the
    single_cells attribute on self or returns the assembled dataframe.

    Arguments
    -----------
    output : bool
        If true, return the single cell dataframe instead of setting a self attribute
    location_x_col_index: int
        index of the x location column (which column in DP output has X coords)
    location_y_col_index: int
        index of the y location column (which column in DP output has Y coords)
    """
    # make sure the list of npz files has been compiled
    if not hasattr(self.deep_data, "filenames"):
        self.deep_data.build_filenames()

    # collect one dataframe (locations + features) per npz file
    per_file_frames = []
    for npz_path in self.deep_data.filenames:
        feature_df = load_npz_features(npz_path)
        # a file without features is skipped, with a warning
        if feature_df.empty:
            warnings.warn(
                f"No features could be found at {npz_path}.\nThis program will continue, but be aware that this might induce errors!"
            )
            continue
        location_df = load_npz_locations(
            npz_path, location_x_col_index, location_y_col_index
        )
        per_file_frames.append(pd.concat([location_df, feature_df], axis=1))

    sc_df = pd.concat(per_file_frames).reset_index(drop=True)
    if not output:
        self.single_cells = sc_df
        return None
    return sc_df

normalize_deep_single_cells(location_x_col_index=0, location_y_col_index=1, image_features=False, meta_features='infer', samples='all', method='standardize', output_file=None, compression_options=None, float_format=None, mad_robustize_epsilon=1e-18, spherize_center=True, spherize_method='ZCA-cor', spherize_epsilon=1e-06)

Normalize all cells into a pandas dataframe.

For each file in the DP project features folder, the features from each cell are loaded. These features are put into a profiles dataframe for use in pycytominer.normalize. A features list is also compiled for use in pycytominer.normalize.

Returns:

Name Type Description
df_out dataframe

dataframe with all metadata and the feature space. This is the input to any further pycytominer or pycytominer-eval processing

Source code in pycytominer/cyto_utils/DeepProfiler_processing.py
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
def normalize_deep_single_cells(
    self,
    location_x_col_index=0,
    location_y_col_index=1,
    image_features=False,  # not implemented with DeepProfiler
    meta_features="infer",
    samples="all",
    method="standardize",
    output_file=None,
    compression_options=None,
    float_format=None,
    mad_robustize_epsilon=1e-18,
    spherize_center=True,
    spherize_method="ZCA-cor",
    spherize_epsilon=1e-6,
):
    """
    Normalize all cells into a pandas dataframe.

    For each file in the DP project features folder, the features from each cell are loaded.
    These features are put into a profiles dataframe for use in pycytominer.normalize.
    A features list is also compiled for use in pycytominer.normalize.

    Returns
    -------
    df_out : pandas.dataframe
        dataframe with all metadata and the feature space.
        This is the input to any further pycytominer or pycytominer-eval processing
    """
    print("getting single cells")
    # lazily build the single_cells attribute on first use
    if not hasattr(self, "single_cells"):
        self.get_single_cells(
            output=False,
            location_x_col_index=location_x_col_index,
            location_y_col_index=location_y_col_index,
        )

    # split columns into metadata vs. derived features; the location columns
    # count as metadata even though infer_cp_features does not flag them
    meta_cols = infer_cp_features(self.single_cells, metadata=True)
    meta_cols += ["Location_Center_X", "Location_Center_Y"]
    feature_cols = [
        col for col in self.single_cells.columns.tolist() if col not in meta_cols
    ]

    # delegate the actual work to pycytominer.normalize()
    normalized_df = normalize.normalize(
        profiles=self.single_cells,
        features=feature_cols,
        image_features=image_features,
        meta_features=meta_features,
        samples=samples,
        method=method,
        output_file=None,
        compression_options=compression_options,
        float_format=float_format,
        mad_robustize_epsilon=mad_robustize_epsilon,
        spherize_center=spherize_center,
        spherize_method=spherize_method,
        spherize_epsilon=spherize_epsilon,
    )

    # re-attach the cell locations as the first two columns of the result
    normalized_df.insert(
        0, "Location_Center_X", self.single_cells["Location_Center_X"]
    )
    normalized_df.insert(
        1, "Location_Center_Y", self.single_cells["Location_Center_Y"]
    )

    # write to disk here: normalize() was deliberately called with
    # output_file=None so that it always returns the dataframe
    if output_file is not None:
        output(
            df=normalized_df,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )

    return normalized_df

pycytominer.cyto_utils.annotate_custom

Functions to annotate data frames with custom options according to CMAP specifications.

annotate_cmap(annotated, annotate_join_on, cell_id='unknown', perturbation_mode='none')

Annotates data frame with custom options according to CMAP specifications.

Parameters:

Name Type Description Default
annotated DataFrame

DataFrame of profiles.

required
annotate_join_on str

Typically the well metadata, but how to join external data

required
cell_id str

provide a string to annotate cell id column

"unknown"
perturbation_mode str

How to annotate CMAP specific data (options = ["chemical" , "genetic"])

"none"

Returns:

Type Description
annotated

CMAP annotated data

Source code in pycytominer/cyto_utils/annotate_custom.py
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
def annotate_cmap(
    annotated, annotate_join_on, cell_id="unknown", perturbation_mode="none"
):
    """Annotates data frame with custom options according to CMAP specifications.

    Parameters
    ----------
    annotated : pandas.core.frame.DataFrame
        DataFrame of profiles.
    annotate_join_on : str
        Typically the well metadata, but how to join external data
    cell_id : str, default "unknown"
        provide a string to annotate cell id column
    perturbation_mode : str, default "none"
        How to annotate CMAP specific data (options = ["chemical" , "genetic"])

    Returns
    -------
    annotated
        CMAP annotated data
    """
    pert_opts = ["none", "chemical", "genetic"]
    assert (  # noqa: S101
        perturbation_mode in pert_opts
    ), f"perturbation mode must be one of {pert_opts}"

    assert (  # noqa: S101
        "Metadata_broad_sample" in annotated.columns
    ), "Are you sure this is a CMAP file? 'Metadata_broad_sample column not found.'"

    # core CMAP columns derived from the broad sample identifier
    broad_sample = annotated.Metadata_broad_sample
    annotated = annotated.assign(
        Metadata_pert_id=broad_sample.str.extract(r"(BRD[-N][A-Z0-9]+)"),
        Metadata_pert_mfc_id=broad_sample,
        Metadata_pert_well=annotated.loc[:, annotate_join_on],
        Metadata_pert_id_vendor="",
    )

    if "Metadata_pert_iname" in annotated.columns:
        pert_iname = annotated.Metadata_pert_iname
        annotated = annotated.assign(
            Metadata_pert_mfc_desc=pert_iname,
            Metadata_pert_name=pert_iname,
        )

    if "Metadata_cell_id" not in annotated.columns:
        annotated = annotated.assign(Metadata_cell_id=cell_id)

    if perturbation_mode == "chemical":
        annotated = annotated.assign(
            Metadata_broad_sample_type=[
                "control" if sample in ["DMSO", np.nan] else "trt"
                for sample in annotated.Metadata_broad_sample
            ]
        )

        # Generate Metadata_broad_sample column
        # controls never have broad_sample "empty" afterwards, so this mask
        # stays valid for the dose columns below
        is_control = annotated.Metadata_broad_sample_type == "control"
        annotated.loc[is_control, "Metadata_broad_sample"] = "DMSO"
        annotated.loc[
            annotated.Metadata_broad_sample == "empty", "Metadata_broad_sample_type"
        ] = "empty"

        if "Metadata_mmoles_per_liter" in annotated.columns:
            annotated.loc[is_control, "Metadata_mmoles_per_liter"] = 0

        if "Metadata_solvent" in annotated.columns:
            annotated = annotated.assign(
                Metadata_pert_vehicle=annotated.Metadata_solvent
            )
        if "Metadata_mg_per_ml" in annotated.columns:
            annotated.loc[is_control, "Metadata_mg_per_ml"] = 0

    if perturbation_mode == "genetic" and "Metadata_pert_name" in annotated.columns:
        annotated = annotated.assign(
            Metadata_broad_sample_type=[
                "control" if name == "EMPTY" else "trt"
                for name in annotated.Metadata_pert_name
            ]
        )

    if "Metadata_broad_sample_type" in annotated.columns:
        annotated = annotated.assign(
            Metadata_pert_type=annotated.Metadata_broad_sample_type
        )
    else:
        annotated = annotated.assign(
            Metadata_pert_type="", Metadata_broad_sample_type=""
        )

    return annotated

cp_clean(profiles)

Specifically clean certain column names derived from different CellProfiler versions.

Parameters:

Name Type Description Default
profiles DataFrame

DataFrame of profiles.

required

Returns:

Type Description
profiles

Renamed to standard metadata

Source code in pycytominer/cyto_utils/annotate_custom.py
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
def cp_clean(profiles):
    """Specifically clean certain column names derived from different CellProfiler versions.

    Parameters
    ----------
    profiles : pandas.core.frame.DataFrame
        DataFrame of profiles.

    Returns
    -------
    profiles
        Renamed to standard metadata
    """
    # map legacy image-prefixed metadata names to the standard ones
    column_mapper = {
        "Image_Metadata_Plate": "Metadata_Plate",
        "Image_Metadata_Well": "Metadata_Well",
    }
    return profiles.rename(columns=column_mapper)

pycytominer.cyto_utils.cell_locations

Utility function to augment a metadata file with X,Y locations of cells in each image.

CellLocation

Class holding all the functions to augment a metadata file with X,Y locations of cells in each image.

In the metadata file, which is either a CSV or a Parquet file, - Each row is single multi-channel image - Each image is indexed by multiple columns, e.g., Metadata_Plate, Metadata_Well,Metadata_Site

The single_cell SQLite file contains at least two tables - Nuclei, which has the single-cell-level readouts, including location information - Image, which has the image-level readouts, as well as metadata to link to the metadata file

In the Nuclei table, - Each row is a cell - Each cell has at least 3 columns: Nuclei_Location_Center_X, Nuclei_Location_Center_Y, ImageNumber

In the Image table, - Each row is an image - Each image has at least the same columns as the images in the metadata file are indexed by, e.g., Metadata_Plate,Metadata_Well,Metadata_Site

The methods in this class do the following - Read the metadata file - Read the single_cell file - For each image in the metadata file, find the corresponding image in the single_cell file - For each cell in the corresponding image, find the X,Y location - Add the X,Y locations of all cells to the metadata file in the corresponding row, packed into a single column

Attributes:

Name Type Description
metadata_input str or Pandas DataFrame

Path to the input metadata file or a Pandas DataFrame

single_cell_input str or Engine

Path to the single_cell file or a sqlalchemy.engine.Engine object

augmented_metadata_output str

Path to the output file. If None, the metadata file is not saved to disk

image_column default = 'ImageNumber'

Name of the column in the metadata file that links to the single_cell file, in combination with table_column

image_key default = ['Metadata_Plate', 'Metadata_Well', 'Metadata_Site']

Names of the columns in the metadata file that uniquely identify each image

object_column default = 'ObjectNumber'

Name of the column in the single_cell file that identifies each cell

cell_x_loc default = 'Nuclei_Location_Center_X'

Name of the column in the single_cell file that contains the X location of each cell

cell_y_loc default = 'Nuclei_Location_Center_Y'

Name of the column in the single_cell file that contains the Y location of each cell

table_column default = 'TableNumber'

Name of the column in the metadata file that links to the single_cell file, in combination with image_column

Methods:

Name Description
add_cell_location

Augment the metadata file and optionally save it to a file

Source code in pycytominer/cyto_utils/cell_locations.py
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
class CellLocation:
    """Class holding all the functions to augment a metadata file with X,Y locations of cells in each image.

    In the metadata file, which is either a CSV or a Parquet file,
    - Each row is single multi-channel image
    - Each image is indexed by multiple columns, e.g., `Metadata_Plate`, `Metadata_Well`,`Metadata_Site`

    The single_cell SQLite file contains at least two tables
    - `Nuclei`, which has the single-cell-level readouts, including location information
    - `Image`, which has the image-level readouts, as well as metadata to link to the metadata file

    In the `Nuclei` table,
    - Each row is a cell
    - Each cell has at least 3 columns: `Nuclei_Location_Center_X`, `Nuclei_Location_Center_Y`, `ImageNumber`

    In the `Image` table,
    - Each row is an image
    - Each image has at least the same columns as the images in the metadata file are indexed by, e.g., `Metadata_Plate`,`Metadata_Well`,`Metadata_Site`

    The methods in this class do the following
    - Read the metadata file
    - Read the single_cell file
    - For each image in the metadata file, find the corresponding image in the single_cell file
    - For each cell in the corresponding image, find the X,Y location
    - Add the X,Y locations of all cells to the metadata file in the corresponding row, packed into a single column


    Attributes
    ----------
    metadata_input : str or Pandas DataFrame
        Path to the input metadata file or a Pandas DataFrame

    single_cell_input : str or sqlalchemy.engine.Engine
        Path to the single_cell file or a sqlalchemy.engine.Engine object

    augmented_metadata_output : str
        Path to the output file. If None, the metadata file is not saved to disk

    image_column : default = 'ImageNumber'
        Name of the column in the metadata file that links to the single_cell file, in combination with `table_column`

    image_key: default = None
        Names of the columns in the metadata file that uniquely identify each image.
        None selects ['Metadata_Plate', 'Metadata_Well', 'Metadata_Site']

    object_column : default = 'ObjectNumber'
        Name of the column in the single_cell file that identifies each cell

    cell_x_loc : default = 'Nuclei_Location_Center_X'
        Name of the column in the single_cell file that contains the X location of each cell

    cell_y_loc : default = 'Nuclei_Location_Center_Y'
        Name of the column in the single_cell file that contains the Y location of each cell

    table_column : default = 'TableNumber'
        Name of the column in the metadata file that links to the single_cell file, in combination with `image_column`

    Methods
    -------
    add_cell_location()
        Augment the metadata file and optionally save it to a file

    """

    def __init__(
        self,
        metadata_input: Union[str, pd.DataFrame],
        single_cell_input: Union[str, sqlalchemy.engine.Engine],
        augmented_metadata_output: Optional[str] = None,
        overwrite: bool = False,
        image_column: str = "ImageNumber",
        object_column: str = "ObjectNumber",
        table_column: str = "TableNumber",
        image_key: Optional[list] = None,
        cell_x_loc: str = "Nuclei_Location_Center_X",
        cell_y_loc: str = "Nuclei_Location_Center_Y",
    ):
        self.metadata_input = self._expanduser(metadata_input)
        self.augmented_metadata_output = self._expanduser(augmented_metadata_output)
        self.single_cell_input = self._expanduser(single_cell_input)
        self.overwrite = overwrite
        self.image_column = image_column
        self.object_column = object_column
        self.table_column = table_column
        # None (rather than a mutable list literal) is used as the default so a
        # single list instance is not shared across constructor calls
        self.image_key = (
            image_key
            if image_key is not None
            else ["Metadata_Plate", "Metadata_Well", "Metadata_Site"]
        )
        self.cell_x_loc = cell_x_loc
        self.cell_y_loc = cell_y_loc
        # Currently constrained to only anonymous access for S3 resources
        # https://github.com/cytomining/pycytominer/issues/268
        self.s3 = boto3.client(
            "s3", config=botocore.config.Config(signature_version=botocore.UNSIGNED)
        )

    def _expanduser(self, obj):
        """Expand the user home directory when `obj` is a local path string.

        Non-string inputs (DataFrames, engines, None) and S3 URIs are
        returned unchanged.
        """
        if obj is not None and isinstance(obj, str) and not obj.startswith("s3://"):
            return pathlib.Path(obj).expanduser().as_posix()
        return obj

    def _parse_s3_path(self, s3_path: str):
        """Parse an S3 path into a bucket and key.

        Parameters
        ----------
        s3_path : str
            The S3 path

        Returns
        -------
        str
            The bucket
        str
            The key
        """
        s3_path = s3_path.replace("s3://", "")

        bucket = s3_path.split("/")[0]

        key = "/".join(s3_path.split("/")[1:])

        return bucket, key

    def _s3_file_exists(self, s3_path: str):
        """Check if a file exists on S3.

        Parameters
        ----------
        s3_path : str
            The path to the file on S3

        Returns
        -------
        bool
            True if the file exists on S3, False otherwise
        """
        bucket, key = self._parse_s3_path(s3_path)

        try:
            self.s3.head_object(Bucket=bucket, Key=key)
        except botocore.exceptions.ClientError as e:
            # missing object or inaccessible object are both treated as absent
            if e.response["Error"]["Code"] in ["404", "400", "403"]:
                return False
            else:
                raise
        else:
            return True

    def _download_s3(self, uri: str):
        """Download a file from S3 to a temporary file and return the temporary path."""
        bucket, key = self._parse_s3_path(uri)

        tmp_file = tempfile.NamedTemporaryFile(
            delete=False, suffix=pathlib.Path(key).name
        )

        self.s3.download_file(bucket, key, tmp_file.name)

        return tmp_file.name

    def _load_metadata(self):
        """Load the metadata into a Pandas DataFrame.

        Returns
        -------
        Pandas DataFrame
            The metadata loaded into a Pandas DataFrame
        """
        if not isinstance(self.metadata_input, pd.DataFrame):
            # verify that the metadata file is a CSV or a Parquet file

            if not (
                self.metadata_input.endswith(".csv")
                or self.metadata_input.endswith(".parquet")
            ):
                raise ValueError("Metadata file must be a CSV or a Parquet file")

            storage_options = (
                {"anon": True} if self.metadata_input.startswith("s3://") else None
            )

            # load the metadata file into a Pandas DataFrame
            if self.metadata_input.endswith(".csv"):
                df = pd.read_csv(
                    self.metadata_input, dtype=str, storage_options=storage_options
                )
            else:
                df = pd.read_parquet(
                    self.metadata_input, storage_options=storage_options
                )

            # cast all columns to string
            df = df.astype(str)
        else:
            df = self.metadata_input

        # verify that the image index columns are present in the metadata object

        if not all(elem in df.columns for elem in self.image_key):
            raise ValueError(
                f"Image index columns {self.image_key} are not present in the metadata file"
            )

        return df

    def _create_nested_df(self, df: pd.DataFrame):
        """Create a new column `CellCenters` by nesting the X and Y locations of cell from an image into the row of the image.

        Parameters
        ----------
        df : Pandas DataFrame
            The DataFrame to convert

        Returns
        -------
        Pandas DataFrame
        """
        # define a dictionary to store the output
        output_df_list = collections.defaultdict(list)

        # iterate over each group of cells in the merged DataFrame
        group_cols = [*self.image_key, self.image_column, self.table_column]

        for group_values, cell_df in df.groupby(group_cols):
            # add the image-level information to the output dictionary
            for key, value in zip(group_cols, group_values):
                output_df_list[key].append(value)

            # convert the cell DataFrame to a dictionary
            cell_dict = cell_df.to_dict(orient="list")

            # iterate over each cell in the cell DataFrame
            row_cell_dicts = []
            for object_column, cell_x_loc, cell_y_loc in zip(
                cell_dict[self.object_column],
                cell_dict[self.cell_x_loc],
                cell_dict[self.cell_y_loc],
            ):
                # add the cell information to a dictionary
                row_cell_dicts.append({
                    self.object_column: object_column,
                    self.cell_x_loc: cell_x_loc,
                    self.cell_y_loc: cell_y_loc,
                })

            # add the cell-level information to the output dictionary
            output_df_list["CellCenters"].append(row_cell_dicts)

        # convert the output dictionary to a Pandas DataFrame
        return pd.DataFrame(output_df_list)

    def _get_single_cell_engine(self):
        """Get the sqlalchemy.engine.Engine object for the single_cell file."""
        if isinstance(self.single_cell_input, str):
            # check if the single_cell file is a SQLite file
            if not self.single_cell_input.endswith(".sqlite"):
                raise ValueError("single_cell file must be a SQLite file")

            # if the single_cell file is an S3 path, download it to a temporary file
            if self.single_cell_input.startswith("s3://"):
                temp_single_cell_input = self._download_s3(self.single_cell_input)

                # connect to the single_cell file
                engine = sqlalchemy.create_engine(f"sqlite:///{temp_single_cell_input}")
            else:
                # connect to the single_cell file
                engine = sqlalchemy.create_engine(f"sqlite:///{self.single_cell_input}")
                temp_single_cell_input = None

        else:
            engine = self.single_cell_input
            temp_single_cell_input = None

        return temp_single_cell_input, engine

    def _check_single_cell_correctness(self, engine: sqlalchemy.engine.Engine):
        """Check that the single_cell file has the required tables and columns."""
        inspector = sqlalchemy.inspect(engine)

        if not all(
            table_name in inspector.get_table_names()
            for table_name in ["Image", "Nuclei"]
        ):
            raise ValueError(
                "Image and Nuclei tables are not present in the single_cell file"
            )

        # Verify that the required columns are present in the single_cell file

        nuclei_columns = [column["name"] for column in inspector.get_columns("Nuclei")]

        if not all(
            column_name in nuclei_columns
            for column_name in [
                self.image_column,
                self.table_column,
                self.object_column,
                self.cell_x_loc,
                self.cell_y_loc,
            ]
        ):
            raise ValueError(
                "Required columns are not present in the Nuclei table in the SQLite file"
            )

        image_columns = [column["name"] for column in inspector.get_columns("Image")]

        if not (
            self.image_column in image_columns
            and self.table_column in image_columns
            and all(elem in image_columns for elem in self.image_key)
        ):
            raise ValueError(
                "Required columns are not present in the Image table in the SQLite file"
            )

    def _get_joined_image_nuclei_tables(self):
        """Merge the Image and Nuclei tables in SQL."""
        # get the sqlalchemy.engine.Engine object for the single_cell file
        temp_single_cell_input, engine = self._get_single_cell_engine()

        # check that the single_cell file has the required tables and columns
        self._check_single_cell_correctness(engine)

        image_index_str = ", ".join(self.image_key)

        # merge the Image and Nuclei tables in SQL

        join_query = f"""
        SELECT Nuclei.{self.table_column},Nuclei.{self.image_column},Nuclei.{self.object_column},Nuclei.{self.cell_x_loc},Nuclei.{self.cell_y_loc},Image.{image_index_str}
        FROM Nuclei
        INNER JOIN Image
        ON Nuclei.{self.image_column} = Image.{self.image_column} and Nuclei.{self.table_column} = Image.{self.table_column};
        """

        column_types = {
            self.image_column: "int64",
            self.table_column: "int64",
            self.object_column: "int64",
            self.cell_x_loc: "float",
            self.cell_y_loc: "float",
        }

        for image_key in self.image_key:
            column_types[image_key] = "str"

        joined_df = pd.read_sql_query(join_query, engine, dtype=column_types)

        # if the single_cell file was downloaded from S3, delete the temporary file
        if temp_single_cell_input is not None:
            pathlib.Path(temp_single_cell_input).unlink()

        return joined_df

    def _load_single_cell(self):
        """Load the required columns from the `Image` and `Nuclei` tables in the single_cell file or sqlalchemy.engine.Engine object into a Pandas DataFrame.

        Returns
        -------
        Pandas DataFrame
            The required columns from the `Image` and `Nuclei` tables loaded into a Pandas DataFrame
        """
        return self._create_nested_df(self._get_joined_image_nuclei_tables())

    def add_cell_location(self):
        """Add the X,Y locations of all cells to the metadata file in the corresponding row, packed into a single column.

        Optionally, save the augmented metadata file as a Parquet file.

        Returns
        -------
        Pandas DataFrame
            Either a data frame or the path to a Parquet file with the X,Y locations of all cells packed into a single column
        """
        # If self.augmented_metadata_output is not None and it is a str and the file already exists, there is nothing to do
        if (
            self.augmented_metadata_output is not None
            and isinstance(self.augmented_metadata_output, str)
            and self.overwrite is False
            and (
                # Check if the file exists on S3 or locally
                (
                    self.augmented_metadata_output.startswith("s3://")
                    and self._s3_file_exists(self.augmented_metadata_output)
                )
                or (
                    not self.augmented_metadata_output.startswith("s3://")
                    and pathlib.Path(self.augmented_metadata_output).exists()
                )
            )
        ):
            # TODO: Consider doing a quick difference check should the file already exist.
            # For example, if the file already exists and it's different than what could be possibly incoming, should the user know?
            # This will involve performing all the steps below and then doing a check to see if the file is different, so this is a bit of a pain.
            return self.augmented_metadata_output

        # Load the data
        metadata_df = self._load_metadata()
        single_cell_df = self._load_single_cell()

        # Merge the data and single_cell tables
        augmented_metadata_df = pd.merge(
            metadata_df,
            single_cell_df,
            on=self.image_key,
            how="left",
        )

        # If self.augmented_metadata_output is not None, save the data
        if self.augmented_metadata_output is not None:
            # TODO: switch to https://github.com/cytomining/pycytominer/blob/main/pycytominer/cyto_utils/output.py if we want to support more file types
            augmented_metadata_df.to_parquet(
                self.augmented_metadata_output, index=False
            )
            return self.augmented_metadata_output
        else:
            return augmented_metadata_df

add_cell_location()

Add the X,Y locations of all cells to the metadata file in the corresponding row, packed into a single column.

Optionally, save the augmented metadata file as a Parquet file.

Returns:

Type Description
Pandas DataFrame

Either a data frame or the path to a Parquet file with the X,Y locations of all cells packed into a single column

Source code in pycytominer/cyto_utils/cell_locations.py
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
def add_cell_location(self):
    """Attach the X,Y locations of all cells to each metadata row, packed into a single column.

    Optionally, save the augmented metadata file as a Parquet file.

    Returns
    -------
    Pandas DataFrame
        Either a data frame or the path to a Parquet file with the X,Y locations of all cells packed into a single column
    """
    output = self.augmented_metadata_output

    # When an output path is configured, overwriting is disabled, and the file
    # already exists (on S3 or locally), skip all work and hand back the path.
    if output is not None and isinstance(output, str) and self.overwrite is False:
        if output.startswith("s3://"):
            already_written = self._s3_file_exists(output)
        else:
            already_written = pathlib.Path(output).exists()
        if already_written:
            # TODO: Consider doing a quick difference check should the file already exist.
            # For example, if the file already exists and it's different than what could be possibly incoming, should the user know?
            # This will involve performing all the steps below and then doing a check to see if the file is different, so this is a bit of a pain.
            return output

    # Load the metadata and single-cell tables, then join cell locations onto
    # each metadata row (left join keeps metadata rows without matching cells).
    augmented_metadata_df = pd.merge(
        self._load_metadata(),
        self._load_single_cell(),
        on=self.image_key,
        how="left",
    )

    # No output path configured: return the in-memory dataframe directly
    if output is None:
        return augmented_metadata_df

    # TODO: switch to https://github.com/cytomining/pycytominer/blob/main/pycytominer/cyto_utils/output.py if we want to support more file types
    augmented_metadata_df.to_parquet(output, index=False)
    return output

pycytominer.cyto_utils.cell_locations_cmd

CLI for cell location calculations.

pycytominer.cyto_utils.cells

Module containing the SingleCells class, which is used to interact with single cell morphological profiles.

SingleCells

Class to interact with single cell morphological profiles including aggregation, normalization, and output.

Attributes:

Name Type Description
sql_file str

SQLite connection pointing to the single cell database. The string prefix must be "sqlite:///".

strata list of str, default ["Metadata_Plate", "Metadata_Well"]

The columns to groupby and aggregate single cells.

aggregation_operation str, default "median"

Operation to perform single cell aggregation.

output_file str, default None

If specified, the location to write the file.

compartments list of str, default ["cells", "cytoplasm", "nuclei"]

List of compartments to process.

compartment_linking_cols dict, default noted below

Dictionary identifying how to merge columns across tables.

merge_cols list of str, default ["TableNumber", "ImageNumber"]

Columns indicating how to merge image and compartment data.

image_cols list of str, default ["TableNumber", "ImageNumber", "Metadata_Site"]

Columns to select from the image table.

add_image_features bool, default False

Whether to add image features to the profiles.

image_feature_categories list of str, optional

List of categories of features from the image table to add to the profiles.

features str or list of str, default "infer"

List of features that should be loaded or aggregated.

load_image_data bool, default True

Whether or not the image data should be loaded into memory.

image_table_name str, default "image"

The name of the table inside the SQLite file of image measurements.

subsample_frac float, default 1

The percentage of single cells to select (0 < subsample_frac <= 1).

subsample_n str or int, default "all"

How many samples to subsample - do not specify both subsample_frac and subsample_n.

subsampling_random_state str or int, default None

The random state to init subsample.

fields_of_view list of int, str, default "all"

List of fields of view to aggregate.

fields_of_view_feature str, default "Metadata_Site"

Name of the fields of view feature.

object_feature str, default "Metadata_ObjectNumber"

Object number feature.

default_datatype_float type

Numpy floating point datatype to use for load_compartment and resulting dataframes. This parameter may be used to assist with performance-related issues by reducing the memory required for floating-point data. For example, using np.float32 instead of np.float64 for this parameter will reduce memory consumed by float columns by roughly 50%. Please note: using any besides np.float64 are experimentally unverified.

Notes

.. note:: the argument compartment_linking_cols is designed to work with CellProfiler output, as curated by cytominer-database. The default is: { "cytoplasm": { "cells": "Cytoplasm_Parent_Cells", "nuclei": "Cytoplasm_Parent_Nuclei", }, "cells": {"cytoplasm": "ObjectNumber"}, "nuclei": {"cytoplasm": "ObjectNumber"}, }

Source code in pycytominer/cyto_utils/cells.py
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
class SingleCells:
    """Class to interact with single cell morphological profiles including aggregation, normalization, and output.

    Attributes
    ----------
    sql_file : str
        SQLite connection pointing to the single cell database.
        The string prefix must be "sqlite:///".
    strata : list of str, default ["Metadata_Plate", "Metadata_Well"]
        The columns to groupby and aggregate single cells.
    aggregation_operation : str, default "median"
        Operation to perform single cell aggregation.
    output_file : str, default None
        If specified, the location to write the file.
    compartments : list of str, default ["cells", "cytoplasm", "nuclei"]
        List of compartments to process.
    compartment_linking_cols : dict, default noted below
        Dictionary identifying how to merge columns across tables.
    merge_cols : list of str, default ["TableNumber", "ImageNumber"]
        Columns indicating how to merge image and compartment data.
    image_cols : list of str, default ["TableNumber", "ImageNumber", "Metadata_Site"]
        Columns to select from the image table.
    add_image_features: bool, default False
        Whether to add image features to the profiles.
    image_feature_categories : list of str, optional
        List of categories of features from the image table to add to the profiles.
    features: str or list of str, default "infer"
        List of features that should be loaded or aggregated.
    load_image_data : bool, default True
        Whether or not the image data should be loaded into memory.
    image_table_name : str, default "image"
        The name of the table inside the SQLite file of image measurements.
    subsample_frac : float, default 1
        The percentage of single cells to select (0 < subsample_frac <= 1).
    subsample_n : str or int, default "all"
        How many samples to subsample - do not specify both subsample_frac and subsample_n.
    subsampling_random_state : str or int, default None
        The random state to init subsample.
    fields_of_view : list of int, str, default "all"
        List of fields of view to aggregate.
    fields_of_view_feature : str, default "Metadata_Site"
        Name of the fields of view feature.
    object_feature : str, default "Metadata_ObjectNumber"
        Object number feature.
    default_datatype_float: type
        Numpy floating point datatype to use for load_compartment and resulting
        dataframes. This parameter may be used to assist with performance-related
        issues by reducing the memory required for floating-point data.
        For example, using np.float32 instead of np.float64 for this parameter
        will reduce memory consumed by float columns by roughly 50%.
        Please note: using any besides np.float64 are experimentally
        unverified.

    Notes
    -----
    .. note::
        the argument compartment_linking_cols is designed to work with CellProfiler output,
        as curated by cytominer-database. The default is: {
            "cytoplasm": {
                "cells": "Cytoplasm_Parent_Cells",
                "nuclei": "Cytoplasm_Parent_Nuclei",
            },
            "cells": {"cytoplasm": "ObjectNumber"},
            "nuclei": {"cytoplasm": "ObjectNumber"},
        }
    """

    def __init__(
        self,
        sql_file,
        strata=None,
        aggregation_operation="median",
        output_file=None,
        compartments=default_compartments,
        compartment_linking_cols=default_linking_cols,
        merge_cols=None,
        image_cols=None,
        add_image_features=False,
        image_feature_categories=None,
        features="infer",
        load_image_data=True,
        image_table_name="image",
        subsample_frac=1,
        subsample_n="all",
        subsampling_random_state=None,
        fields_of_view="all",
        fields_of_view_feature="Metadata_Site",
        object_feature="Metadata_ObjectNumber",
        default_datatype_float=np.float64,
    ):
        """Construct a SingleCells object.

        Parameter semantics are documented on the class docstring. The
        list-valued parameters (strata, merge_cols, image_cols) use None
        sentinels resolved to fresh lists per call, avoiding the shared
        mutable-default-argument pitfall while keeping the same effective
        defaults for callers.
        """
        # Resolve None sentinels so each instance owns its own default lists
        if strata is None:
            strata = ["Metadata_Plate", "Metadata_Well"]
        if merge_cols is None:
            merge_cols = ["TableNumber", "ImageNumber"]
        if image_cols is None:
            image_cols = ["TableNumber", "ImageNumber", "Metadata_Site"]

        # Check compartments specified
        check_compartments(compartments)

        # Check if correct operation is specified
        aggregation_operation = check_aggregate_operation(aggregation_operation)

        # Check that the subsample_frac is between 0 and 1
        assert (  # noqa: S101
            subsample_frac > 0 and subsample_frac <= 1
        ), "subsample_frac must be between 0 and 1"

        self.sql_file = sql_file
        self.strata = strata
        self.load_image_data = load_image_data
        self.image_table_name = image_table_name
        self.aggregation_operation = aggregation_operation.lower()
        self.output_file = output_file
        self.merge_cols = merge_cols
        self.image_cols = image_cols
        self.add_image_features = add_image_features
        self.image_feature_categories = image_feature_categories
        self.features = features
        self.subsample_frac = subsample_frac
        self.subsample_n = subsample_n
        self.subset_data_df = None
        self.subsampling_random_state = subsampling_random_state
        self.is_aggregated = False
        self.is_subset_computed = False
        self.compartments = compartments
        self.compartment_linking_cols = compartment_linking_cols
        self.fields_of_view_feature = fields_of_view_feature
        self.object_feature = object_feature
        self.default_datatype_float = default_datatype_float

        # Confirm that the compartments and linking cols are formatted properly
        assert_linking_cols_complete(
            compartments=self.compartments, linking_cols=self.compartment_linking_cols
        )

        # Build a dictionary to update linking column feature names
        self.linking_col_rename = provide_linking_cols_feature_name_update(
            self.compartment_linking_cols
        )

        if self.subsample_n != "all":
            self.set_subsample_n(self.subsample_n)

        # Connect to sqlite engine
        self.engine = create_engine(self.sql_file)
        self.conn = self.engine.connect()

        # Throw an error if both subsample_frac and subsample_n is set
        self._check_subsampling()

        # Confirm that the input fields of view is valid
        self.fields_of_view = check_fields_of_view_format(fields_of_view)

        # attribute to track image table data load status
        self.image_data_loaded = False
        if self.load_image_data:
            self.load_image(image_table_name=self.image_table_name)
    def _check_subsampling(self):
        """Verify that subsample_frac and subsample_n were not both specified.

        Returns
        -------
        None
            Nothing is returned.
        """
        # Both settings at their defaults-or-one-changed is fine; changing
        # both at once is ambiguous and therefore rejected.
        both_configured = self.subsample_frac != 1 and self.subsample_n != "all"
        assert not both_configured, (  # noqa: S101
            "Do not set both subsample_frac and subsample_n"
        )

    def set_output_file(self, output_file):
        """Point the object at a new output file location.

        Parameters
        ----------
        output_file : str
            New output file name.

        Returns
        -------
        None
            Nothing is returned.
        """
        self.output_file = output_file

    def set_subsample_frac(self, subsample_frac):
        """Update the fraction of single cells to subsample.

        Parameters
        ----------
        subsample_frac : float, default 1
            Percentage of single cells to select (0 < subsample_frac <= 1).

        Returns
        -------
        None
            Nothing is returned.
        """
        self.subsample_frac = subsample_frac
        # Re-validate so frac and n are never both configured
        self._check_subsampling()

    def set_subsample_n(self, subsample_n):
        """Set or update the subsample n.

        Parameters
        ----------
        subsample_n : int, default "all"
            Indicate how many samples to subsample - do not specify both subsample_frac and subsample_n.

        Returns
        -------
        None
            Nothing is returned.

        Raises
        ------
        ValueError
            If subsample_n cannot be coerced to an integer.
        """
        try:
            self.subsample_n = int(subsample_n)
        except ValueError as err:
            # Chain the original exception so the offending value's traceback
            # is preserved for debugging (PEP 3134 / ruff B904).
            raise ValueError("subsample n must be an integer or coercable") from err
        self._check_subsampling()

    def set_subsample_random_state(self, random_state):
        """Record the random state used to initialize subsampling.

        Parameters
        ----------
        random_state: int, optional
            The random state to init subsample.

        Returns
        -------
        None
            Nothing is returned.
        """
        self.subsampling_random_state = random_state

    def load_image(self, image_table_name=None):
        """Load the image table from the sqlite file into memory.

        Parameters
        ----------
        image_table_name : str, optional
            Table to read; falls back to self.image_table_name when None.

        Returns
        -------
        None
            Nothing is returned.
        """
        table = (
            self.image_table_name if image_table_name is None else image_table_name
        )

        self.image_df = pd.read_sql(sql=f"select * from {table}", con=self.conn)

        if self.add_image_features:
            self.image_features_df = extract_image_features(
                self.image_feature_categories,
                self.image_df,
                self.image_cols,
                self.strata,
            )

        # Keep only the identifying and strata columns of the image table
        keep_columns = list(np.union1d(self.image_cols, self.strata))
        self.image_df = self.image_df[keep_columns]

        if self.fields_of_view != "all":
            check_fields_of_view(
                list(np.unique(self.image_df[self.fields_of_view_feature])),
                list(self.fields_of_view),
            )
            # Restrict both image and image-feature tables to requested FOVs
            fov_query = f"{self.fields_of_view_feature}==@self.fields_of_view"
            self.image_df = self.image_df.query(fov_query)

            if self.add_image_features:
                self.image_features_df = self.image_features_df.query(fov_query)

        self.image_data_loaded = True

    def count_cells(self, compartment="cells", count_subset=False):
        """Determine how many cells are measured per well.

        Parameters
        ----------
        compartment : str, default "cells"
            Compartment to subset.
        count_subset : bool, default False
            Whether or not count the number of cells as specified by the strata groups.

        Returns
        -------
        pandas.core.frame.DataFrame
            DataFrame of cell counts in the experiment.
        """
        check_compartments(compartment)

        if count_subset:
            # Counting the subset requires that both steps ran beforehand
            assert self.is_aggregated, "Make sure to aggregate_profiles() first!"  # noqa: S101
            assert self.is_subset_computed, "Make sure to get_subsample() first!"  # noqa: S101
            return (
                self.subset_data_df.groupby(self.strata)["Metadata_ObjectNumber"]
                .count()
                .reset_index()
                .rename({"Metadata_ObjectNumber": "cell_count"}, axis="columns")
            )

        # Pull object identifiers from the database and keep only objects that
        # match an image row before counting per strata group
        compartment_df = pd.read_sql(
            sql=f"select TableNumber, ImageNumber, ObjectNumber from {compartment}",
            con=self.conn,
        )
        merged_df = self.image_df.merge(
            compartment_df, how="inner", on=self.merge_cols
        )
        return (
            merged_df.groupby(self.strata)["ObjectNumber"]
            .count()
            .reset_index()
            .rename({"ObjectNumber": "cell_count"}, axis="columns")
        )

    def subsample_profiles(self, df, rename_col=True):
        """Sample a Pandas DataFrame given subsampling information.

        Parameters
        ----------
        df : pandas.core.frame.DataFrame
            DataFrame of a single cell profile.
        rename_col : bool, default True
            Whether or not to rename the columns.

        Returns
        -------
        pandas.core.frame.DataFrame
            A subsampled pandas dataframe of single cell profiles.
        """
        # Lazily pick and remember a random seed the first time sampling runs
        if self.subsampling_random_state is None:
            self.set_subsample_random_state(np.random.randint(0, 10000, size=1)[0])

        if self.subsample_frac == 1:
            # Fixed-count sampling (with replacement, so n may exceed group size)
            sampled_df = df.sample(
                n=self.subsample_n,
                replace=True,
                random_state=self.subsampling_random_state,
            )
        else:
            # Fractional sampling without replacement
            sampled_df = df.sample(
                frac=self.subsample_frac,
                random_state=self.subsampling_random_state,
            )

        if rename_col:
            sampled_df = sampled_df.rename(self.linking_col_rename, axis="columns")

        return sampled_df

    def get_subsample(self, df=None, compartment="cells", rename_col=True):
        """Apply the subsampling procedure.

        Parameters
        ----------
        df : pandas.core.frame.DataFrame
            DataFrame of a single cell profile.
        compartment : str, default "cells"
            The compartment to process.
        rename_col : bool, default True
            Whether or not to rename the columns.

        Returns
        -------
        None
            Nothing is returned.
        """
        check_compartments(compartment)

        # Pull object identifiers from the database if no dataframe was given
        if df is None:
            df = pd.read_sql(
                sql=f"select TableNumber, ImageNumber, ObjectNumber from {compartment}",
                con=self.conn,
            )

        merged_df = self.image_df.merge(df, how="inner", on=self.merge_cols)

        # Subsample each strata group independently, then flatten the index
        self.subset_data_df = (
            merged_df.groupby(self.strata)
            .apply(lambda group: self.subsample_profiles(group, rename_col=rename_col))
            .reset_index(drop=True)
        )

        self.is_subset_computed = True

    def count_sql_table_rows(self, table):
        """Return the total number of rows in a database table."""
        result = self.conn.execute(f"SELECT COUNT(*) FROM {table}")
        # The query yields a single one-element row; unpack it directly
        (num_rows,) = next(result)
        return num_rows

    def get_sql_table_col_names(self, table):
        """Return the column names of a database table."""
        # A LIMIT 1 query is enough to populate the cursor's column description
        cursor = self.conn.execute(f"SELECT * FROM {table} LIMIT 1").cursor
        return [description[0] for description in cursor.description]

    def split_column_categories(self, col_names):
        """Split a list of column names into metadata and feature column lists.

        A column counts as a feature when its lowercased name starts with any
        configured compartment name; everything else is metadata.
        """
        prefixes = tuple(self.compartments)
        feat_cols = [name for name in col_names if name.lower().startswith(prefixes)]
        meta_cols = [
            name for name in col_names if not name.lower().startswith(prefixes)
        ]
        return meta_cols, feat_cols

    def load_compartment(self, compartment):
        """Create the compartment dataframe.

        Note: makes use of default_datatype_float attribute
        for setting a default floating point datatype.

        Parameters
        ----------
        compartment : str
            The compartment to process.

        Returns
        -------
        pandas.core.frame.DataFrame
            Compartment dataframe.
        """
        # Get data useful to pre-alloc memory
        num_cells = self.count_sql_table_rows(compartment)
        col_names = self.get_sql_table_col_names(compartment)
        if self.features != "infer":  # allow to get only some features
            col_names = [x for x in col_names if x in self.features]
        meta_cols, feat_cols = self.split_column_categories(col_names)
        num_meta, num_feats = len(meta_cols), len(feat_cols)

        # Use pre-allocated np.array for feature data
        feats = np.empty(
            shape=(num_cells, num_feats), dtype=self.default_datatype_float
        )
        # Use pre-allocated pd.DataFrame for metadata
        metas = pd.DataFrame(columns=meta_cols, index=range(num_cells))

        # Query database for selected columns of chosen compartment
        # The SELECT lists metadata columns first, then feature columns, so
        # each returned row can be split at index num_meta below.
        columns = ", ".join(meta_cols + feat_cols)
        query = f"select {columns} from {compartment}"
        query_result = self.conn.execute(query)

        # Load data row by row for both meta information and features
        # NOTE(review): per-row fill into pre-sized containers keeps peak memory
        # low, at the cost of Python-level iteration over every row.
        for i, row in enumerate(query_result):
            metas.loc[i] = row[:num_meta]
            feats[i] = row[num_meta:]

        # Return concatenated data and metainformation of compartment
        return pd.concat([metas, pd.DataFrame(columns=feat_cols, data=feats)], axis=1)

    def aggregate_compartment(
        self,
        compartment,
        compute_subsample=False,
        compute_counts=False,
        add_image_features=False,
        n_aggregation_memory_strata=1,
    ):
        """Aggregate morphological profiles. Uses pycytominer.aggregate().

        Parameters
        ----------
        compartment : str
            Compartment to aggregate.
        compute_subsample : bool, default False
            Whether or not to subsample.
        compute_counts : bool, default False
            Whether or not to compute the number of objects in each compartment
            and the number of fields of view per well.
        add_image_features : bool, default False
            Whether or not to add image features.
        n_aggregation_memory_strata : int, default 1
            Number of unique strata to pull from the database into working memory
            at once.  Typically 1 is fastest.  A larger number uses more memory.
            For example, if aggregating by "well", then n_aggregation_memory_strata=1
            means that one "well" will be pulled from the SQLite database into
            memory at a time.

        Returns
        -------
        pandas.core.frame.DataFrame
            DataFrame of aggregated profiles.
        """
        check_compartments(compartment)

        # Subsample only when explicitly requested AND a subsampling scheme
        # (fraction < 1 or a fixed n) has been configured
        if (self.subsample_frac < 1 or self.subsample_n != "all") and compute_subsample:
            self.get_subsample(compartment=compartment)

        # Load image data if not already loaded
        if not self.image_data_loaded:
            self.load_image(image_table_name=self.image_table_name)

        # Iteratively call aggregate() on chunks of the full compartment table
        object_dfs = []
        for compartment_df in self._compartment_df_generator(
            compartment=compartment,
            n_aggregation_memory_strata=n_aggregation_memory_strata,
        ):
            # Attach image-level identifiers to each single cell and normalize
            # linking column names before aggregation
            population_df = self.image_df.merge(
                compartment_df,
                how="inner",
                on=self.merge_cols,
            ).rename(self.linking_col_rename, axis="columns")

            if self.features == "infer":
                aggregate_features = infer_cp_features(
                    population_df, compartments=compartment
                )
            else:
                aggregate_features = self.features

            # Aggregate this chunk of single cells into per-stratum profiles
            partial_object_df = aggregate(
                population_df=population_df,
                strata=self.strata,
                compute_object_count=compute_counts,
                operation=self.aggregation_operation,
                subset_data_df=self.subset_data_df,
                features=aggregate_features,
                object_feature=self.object_feature,
            )

            if compute_counts and self.fields_of_view_feature not in self.strata:
                # Append per-stratum field-of-view counts as metadata
                fields_count_df = aggregate_fields_count(
                    self.image_df, self.strata, self.fields_of_view_feature
                )

                if add_image_features:
                    fields_count_df = aggregate_image_features(
                        fields_count_df,
                        self.image_features_df,
                        self.image_feature_categories,
                        self.image_cols,
                        self.strata,
                        self.aggregation_operation,
                    )

                # Right merge keeps every aggregated profile row
                partial_object_df = fields_count_df.merge(
                    partial_object_df,
                    on=self.strata,
                    how="right",
                )

                # Separate all the metadata and feature columns.
                metadata_cols = infer_cp_features(partial_object_df, metadata=True)
                feature_cols = infer_cp_features(partial_object_df, image_features=True)

                # Reorder so metadata columns precede feature columns
                partial_object_df = partial_object_df.reindex(
                    columns=metadata_cols + feature_cols
                )

            object_dfs.append(partial_object_df)

        # Concatenate one or more aggregated dataframes row-wise into final output
        object_df = pd.concat(object_dfs, axis=0).reset_index(drop=True)

        return object_df

    def _compartment_df_generator(
        self,
        compartment,
        n_aggregation_memory_strata=1,
    ):
        """Yield chunks of the entire compartment table from disk.

        Chunks are constructed so that all rows belonging to any one
        aggregation stratum (per self.strata / self.merge_cols) land in the
        same chunk, keeping downstream groupby aggregations valid.

        Parameters
        ----------
        compartment : str
            Compartment to aggregate.
        n_aggregation_memory_strata : int, default 1
            Number of unique strata to pull from the database into working memory
            at once.  Typically 1 is fastest.  A larger number uses more memory.

        Returns
        -------
        image_df : Iterator[pandas.core.frame.DataFrame]
            A generator whose __next__() call returns a chunk of the compartment
            table, where rows comprising a unique aggregation stratum are not split
            between chunks, and thus groupby aggregations are valid

        """
        assert (  # noqa: S101
            n_aggregation_memory_strata > 0
        ), "Number of strata to pull into memory at once (n_aggregation_memory_strata) must be > 0"

        cols = "*"

        # Inspect a single row to discover the compartment table's columns
        first_row_df = pd.read_sql(
            sql=f"select {cols} from {compartment} limit 1",
            con=self.conn,
        )
        all_columns = first_row_df.columns
        if self.features != "infer":  # allow to get only some features
            all_columns = [x for x in all_columns if x in self.features]

        # Ask SQLite for the storage class of each column
        typeof_str = ", ".join(f"typeof({col})" for col in all_columns)
        dtype_row_df = pd.read_sql(
            sql=f"select {typeof_str} from {compartment} limit 1",
            con=self.conn,
        )

        def _strip_typeof(name):
            # SQLite names these result columns 'typeof(<col>)'; recover '<col>'
            return name[7:-1]

        dtype_dict = dict(
            zip(
                [_strip_typeof(name) for name in dtype_row_df.columns],  # column names
                dtype_row_df.iloc[0].values,  # corresponding data types
            )
        )

        # Collect, for every unique strata combination, the merge_cols values it spans
        df_unique_mergecols = (
            self.image_df[self.strata + self.merge_cols]
            .groupby(self.strata)
            .agg(lambda s: np.unique(s).tolist())
            .reset_index(drop=True)
        )

        # Translate groups of strata into SQLite WHERE-clause condition strings
        strata_conditions = _sqlite_strata_conditions(
            df=df_unique_mergecols,
            dtypes=dtype_dict,
            n=n_aggregation_memory_strata,
        )

        # Stream one chunk of compartment rows per condition group
        for condition in strata_conditions:
            chunk_query = f"select {cols} from {compartment} where {condition}"
            yield pd.read_sql(sql=chunk_query, con=self.conn)

    def merge_single_cells(
        self,
        compute_subsample: bool = False,
        sc_output_file: Optional[str] = None,
        compression_options: Optional[str] = None,
        float_format: Optional[str] = None,
        single_cell_normalize: bool = False,
        normalize_args: Optional[Dict] = None,
        platemap: Optional[Union[str, pd.DataFrame]] = None,
        **kwargs,
    ):
        """Given the linking columns, merge single cell data. Normalization is also supported.

        Compartment tables are merged pairwise according to
        self.compartment_linking_cols, image metadata is joined on, linking
        columns are renamed with a "Metadata_" prefix, and the result is
        optionally normalized, annotated with platemap metadata, and written
        to file.

        Parameters
        ----------
        compute_subsample : bool, default False
            Whether or not to compute subsample.
        sc_output_file : str, optional
            The name of a file to output.
        compression_options : str, optional
            Compression arguments as input to pandas.to_csv() with pandas version >= 1.2.
        float_format : str, optional
            Decimal precision to use in writing output file.
        single_cell_normalize : bool, default False
            Whether or not to normalize the single cell data.
        normalize_args : dict, optional
            Additional arguments passed as input to pycytominer.normalize().
        platemap: str or pd.DataFrame, default None
            optional platemap filepath str or pd.DataFrame to be used with results via annotate

        Returns
        -------
        pandas.core.frame.DataFrame or str
            if sc_output_file=None returns a Pandas dataframe
            else will write to file and return the filepath of the file
        """
        # Load the single cell dataframe by merging on the specific linking columns
        # sc_df starts as an empty-string sentinel; it becomes a DataFrame once
        # the first (left) compartment is loaded (see isinstance check below).
        sc_df = ""
        linking_check_cols = []
        merge_suffix_rename = []
        for left_compartment in self.compartment_linking_cols:
            for right_compartment in self.compartment_linking_cols[left_compartment]:
                # Make sure only one merge per combination occurs
                # (linking cols are symmetric, so sort the pair to dedupe)
                linking_check = "-".join(sorted([left_compartment, right_compartment]))
                if linking_check in linking_check_cols:
                    continue

                # Specify how to indicate merge suffixes
                merge_suffix = [
                    f"_{left_compartment}",
                    f"_{right_compartment}",
                ]
                merge_suffix_rename += merge_suffix
                left_link_col = self.compartment_linking_cols[left_compartment][
                    right_compartment
                ]
                right_link_col = self.compartment_linking_cols[right_compartment][
                    left_compartment
                ]

                if isinstance(sc_df, str):
                    # First pass: seed sc_df with the left compartment table
                    sc_df = self.load_compartment(compartment=left_compartment)

                    if compute_subsample:
                        # Sample cells proportionally by self.strata
                        self.get_subsample(df=sc_df, rename_col=False)

                        subset_logic_df = self.subset_data_df.drop(
                            self.image_df.columns, axis="columns"
                        )

                        # Keep only subsampled rows while preserving sc_df's
                        # original column order
                        sc_df = subset_logic_df.merge(
                            sc_df, how="left", on=subset_logic_df.columns.tolist()
                        ).reindex(sc_df.columns, axis="columns")

                sc_df = sc_df.merge(
                    self.load_compartment(compartment=right_compartment),
                    left_on=[*self.merge_cols, left_link_col],
                    right_on=[*self.merge_cols, right_link_col],
                    suffixes=merge_suffix,
                )

                linking_check_cols.append(linking_check)

        # Add metadata prefix to merged suffixes
        full_merge_suffix_rename = []
        full_merge_suffix_original = []
        for col_name in self.merge_cols + list(self.linking_col_rename.keys()):
            full_merge_suffix_original.append(col_name)
            full_merge_suffix_rename.append(f"Metadata_{col_name}")

        # Also cover the suffixed variants produced by the pairwise merges above
        for col_name in self.merge_cols + list(self.linking_col_rename.keys()):
            for suffix in set(merge_suffix_rename):
                full_merge_suffix_original.append(f"{col_name}{suffix}")
                full_merge_suffix_rename.append(f"Metadata_{col_name}{suffix}")

        self.full_merge_suffix_rename = dict(
            zip(full_merge_suffix_original, full_merge_suffix_rename)
        )

        # Add image data to single cell dataframe
        if not self.image_data_loaded:
            self.load_image(image_table_name=self.image_table_name)

        sc_df = (
            self.image_df.merge(sc_df, on=self.merge_cols, how="right")
            # pandas rename performance may be improved using copy=False, inplace=False
            # reference: https://ryanlstevens.github.io/2022-05-06-pandasColumnRenaming/
            .rename(self.linking_col_rename, axis="columns", copy=False, inplace=False)
            .rename(
                self.full_merge_suffix_rename, axis="columns", copy=False, inplace=False
            )
        )
        if single_cell_normalize:
            # Inferring features is tricky with non-canonical data
            if normalize_args is None:
                normalize_args = {}
                features = infer_cp_features(sc_df, compartments=self.compartments)
            elif ("features" not in normalize_args) or (
                normalize_args["features"] == "infer"
            ):
                features = infer_cp_features(sc_df, compartments=self.compartments)
            else:
                features = normalize_args["features"]

            normalize_args["features"] = features

            sc_df = normalize(profiles=sc_df, **normalize_args)

        # In case platemap metadata is provided, use pycytominer.annotate for metadata
        if platemap is not None:
            sc_df = annotate(
                profiles=sc_df, platemap=platemap, output_file=None, **kwargs
            )

        # if output argument is provided, call it using df_merged_sc and kwargs
        if sc_output_file is not None:
            return output(
                df=sc_df,
                output_filename=sc_output_file,
                compression_options=compression_options,
                float_format=float_format,
                **kwargs,
            )
        else:
            return sc_df

    def aggregate_profiles(
        self,
        compute_subsample=False,
        output_file=None,
        compression_options=None,
        float_format=None,
        n_aggregation_memory_strata=1,
        **kwargs,
    ):
        """Aggregate and merge compartments. This is the primary entry to this class.

        Parameters
        ----------
        compute_subsample : bool, default False
            Whether or not to compute subsample. compute_subsample must be specified to perform subsampling.
            The function aggregate_profiles(compute_subsample=True) will apply subsetting even if subsample is initialized.
        output_file : str, optional
            The name of a file to output. We recommend that, if provided, the output file be suffixed with "_augmented".
        compression_options : str, optional
            Compression arguments as input to pandas.to_csv() with pandas version >= 1.2.
        float_format : str, optional
            Decimal precision to use in writing output file.
        n_aggregation_memory_strata : int, default 1
            Number of unique strata to pull from the database into working memory
            at once.  Typically 1 is fastest.  A larger number uses more memory.

        Returns
        -------
        pandas.core.frame.DataFrame or str
            if output_file=None returns a Pandas dataframe
            else will write to file and return the filepath of the file
        """
        if output_file is not None:
            self.set_output_file(output_file)

        # Aggregate each compartment in turn. The first compartment also
        # carries object counts and (optionally) image features; every later
        # compartment is merged onto it by the shared strata columns.
        for compartment_idx, compartment in enumerate(self.compartments):
            if compartment_idx == 0:
                aggregated = self.aggregate_compartment(
                    compartment=compartment,
                    compute_subsample=compute_subsample,
                    compute_counts=True,
                    add_image_features=self.add_image_features,
                    n_aggregation_memory_strata=n_aggregation_memory_strata,
                )
            else:
                aggregated = aggregated.merge(
                    self.aggregate_compartment(
                        compartment=compartment,
                        n_aggregation_memory_strata=n_aggregation_memory_strata,
                    ),
                    on=self.strata,
                    how="inner",
                )

        self.is_aggregated = True

        # self.output_file may have been set earlier (e.g. in __init__), so
        # check the attribute rather than the local output_file argument
        if self.output_file is not None:
            return output(
                df=aggregated,
                output_filename=self.output_file,
                compression_options=compression_options,
                float_format=float_format,
                **kwargs,
            )
        else:
            return aggregated

__init__(sql_file, strata=['Metadata_Plate', 'Metadata_Well'], aggregation_operation='median', output_file=None, compartments=default_compartments, compartment_linking_cols=default_linking_cols, merge_cols=['TableNumber', 'ImageNumber'], image_cols=['TableNumber', 'ImageNumber', 'Metadata_Site'], add_image_features=False, image_feature_categories=None, features='infer', load_image_data=True, image_table_name='image', subsample_frac=1, subsample_n='all', subsampling_random_state=None, fields_of_view='all', fields_of_view_feature='Metadata_Site', object_feature='Metadata_ObjectNumber', default_datatype_float=np.float64)

Construct a SingleCells object.

Source code in pycytominer/cyto_utils/cells.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
def __init__(
    self,
    sql_file,
    strata=["Metadata_Plate", "Metadata_Well"],
    aggregation_operation="median",
    output_file=None,
    compartments=default_compartments,
    compartment_linking_cols=default_linking_cols,
    merge_cols=["TableNumber", "ImageNumber"],
    image_cols=["TableNumber", "ImageNumber", "Metadata_Site"],
    add_image_features=False,
    image_feature_categories=None,
    features="infer",
    load_image_data=True,
    image_table_name="image",
    subsample_frac=1,
    subsample_n="all",
    subsampling_random_state=None,
    fields_of_view="all",
    fields_of_view_feature="Metadata_Site",
    object_feature="Metadata_ObjectNumber",
    default_datatype_float=np.float64,
):
    """Construct a SingleCells object.

    Validates the requested compartments, aggregation operation, and
    subsampling configuration, stores the settings on the instance, opens a
    database connection via SQLAlchemy from ``sql_file``, and (when
    ``load_image_data`` is True) loads the image table into memory.

    NOTE(review): the list defaults (strata, merge_cols, image_cols) are
    shared mutable defaults. They are only stored here, never mutated, but
    mutating e.g. ``self.strata`` in place would leak across instances.
    """
    # Check compartments specified
    check_compartments(compartments)

    # Check if correct operation is specified
    aggregation_operation = check_aggregate_operation(aggregation_operation)

    # Check that the subsample_frac is between 0 and 1
    assert (  # noqa: S101
        subsample_frac > 0 and subsample_frac <= 1
    ), "subsample_frac must be between 0 and 1"

    self.sql_file = sql_file
    self.strata = strata
    self.load_image_data = load_image_data
    self.image_table_name = image_table_name
    self.aggregation_operation = aggregation_operation.lower()
    self.output_file = output_file
    self.merge_cols = merge_cols
    self.image_cols = image_cols
    self.add_image_features = add_image_features
    self.image_feature_categories = image_feature_categories
    self.features = features
    self.subsample_frac = subsample_frac
    self.subsample_n = subsample_n
    self.subset_data_df = None
    self.subsampling_random_state = subsampling_random_state
    self.is_aggregated = False
    self.is_subset_computed = False
    self.compartments = compartments
    self.compartment_linking_cols = compartment_linking_cols
    self.fields_of_view_feature = fields_of_view_feature
    self.object_feature = object_feature
    self.default_datatype_float = default_datatype_float

    # Confirm that the compartments and linking cols are formatted properly
    assert_linking_cols_complete(
        compartments=self.compartments, linking_cols=self.compartment_linking_cols
    )

    # Build a dictionary to update linking column feature names
    self.linking_col_rename = provide_linking_cols_feature_name_update(
        self.compartment_linking_cols
    )

    if self.subsample_n != "all":
        self.set_subsample_n(self.subsample_n)

    # Connect to sqlite engine
    # NOTE(review): sql_file is treated as an SQLAlchemy connection string
    self.engine = create_engine(self.sql_file)
    self.conn = self.engine.connect()

    # Throw an error if both subsample_frac and subsample_n is set
    self._check_subsampling()

    # Confirm that the input fields of view is valid
    self.fields_of_view = check_fields_of_view_format(fields_of_view)

    # attribute to track image table data load status
    self.image_data_loaded = False
    if self.load_image_data:
        self.load_image(image_table_name=self.image_table_name)

aggregate_compartment(compartment, compute_subsample=False, compute_counts=False, add_image_features=False, n_aggregation_memory_strata=1)

Aggregate morphological profiles. Uses pycytominer.aggregate().

Parameters:

Name Type Description Default
compartment str

Compartment to aggregate.

required
compute_subsample bool

Whether or not to subsample.

False
compute_counts bool

Whether or not to compute the number of objects in each compartment and the number of fields of view per well.

False
add_image_features bool

Whether or not to add image features.

False
n_aggregation_memory_strata int

Number of unique strata to pull from the database into working memory at once. Typically 1 is fastest. A larger number uses more memory. For example, if aggregating by "well", then n_aggregation_memory_strata=1 means that one "well" will be pulled from the SQLite database into memory at a time.

1

Returns:

Type Description
DataFrame

DataFrame of aggregated profiles.

Source code in pycytominer/cyto_utils/cells.py
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
def aggregate_compartment(
    self,
    compartment,
    compute_subsample=False,
    compute_counts=False,
    add_image_features=False,
    n_aggregation_memory_strata=1,
):
    """Aggregate morphological profiles. Uses pycytominer.aggregate().

    Parameters
    ----------
    compartment : str
        Compartment to aggregate.
    compute_subsample : bool, default False
        Whether or not to subsample.
    compute_counts : bool, default False
        Whether or not to compute the number of objects in each compartment
        and the number of fields of view per well.
    add_image_features : bool, default False
        Whether or not to add image features.
    n_aggregation_memory_strata : int, default 1
        Number of unique strata to pull from the database into working memory
        at once.  Typically 1 is fastest.  A larger number uses more memory.
        For example, if aggregating by "well", then n_aggregation_memory_strata=1
        means that one "well" will be pulled from the SQLite database into
        memory at a time.

    Returns
    -------
    pandas.core.frame.DataFrame
        DataFrame of aggregated profiles.
    """
    check_compartments(compartment)

    # Subsample before aggregating when requested and configured
    if (self.subsample_frac < 1 or self.subsample_n != "all") and compute_subsample:
        self.get_subsample(compartment=compartment)

    # Load image data if not already loaded
    if not self.image_data_loaded:
        self.load_image(image_table_name=self.image_table_name)

    # Iteratively call aggregate() on chunks of the full compartment table
    # (each chunk contains complete strata, so per-chunk aggregation is valid)
    object_dfs = []
    for compartment_df in self._compartment_df_generator(
        compartment=compartment,
        n_aggregation_memory_strata=n_aggregation_memory_strata,
    ):
        # Join image metadata and normalize linking column names
        population_df = self.image_df.merge(
            compartment_df,
            how="inner",
            on=self.merge_cols,
        ).rename(self.linking_col_rename, axis="columns")

        if self.features == "infer":
            aggregate_features = infer_cp_features(
                population_df, compartments=compartment
            )
        else:
            aggregate_features = self.features

        partial_object_df = aggregate(
            population_df=population_df,
            strata=self.strata,
            compute_object_count=compute_counts,
            operation=self.aggregation_operation,
            subset_data_df=self.subset_data_df,
            features=aggregate_features,
            object_feature=self.object_feature,
        )

        # Count fields of view per stratum unless the field-of-view feature is
        # itself a stratification key (which would make the count trivial)
        if compute_counts and self.fields_of_view_feature not in self.strata:
            fields_count_df = aggregate_fields_count(
                self.image_df, self.strata, self.fields_of_view_feature
            )

            if add_image_features:
                fields_count_df = aggregate_image_features(
                    fields_count_df,
                    self.image_features_df,
                    self.image_feature_categories,
                    self.image_cols,
                    self.strata,
                    self.aggregation_operation,
                )

            partial_object_df = fields_count_df.merge(
                partial_object_df,
                on=self.strata,
                how="right",
            )

            # Separate all the metadata and feature columns.
            metadata_cols = infer_cp_features(partial_object_df, metadata=True)
            feature_cols = infer_cp_features(partial_object_df, image_features=True)

            # Reorder so metadata columns come before feature columns
            partial_object_df = partial_object_df.reindex(
                columns=metadata_cols + feature_cols
            )

        object_dfs.append(partial_object_df)

    # Concatenate one or more aggregated dataframes row-wise into final output
    object_df = pd.concat(object_dfs, axis=0).reset_index(drop=True)

    return object_df

aggregate_profiles(compute_subsample=False, output_file=None, compression_options=None, float_format=None, n_aggregation_memory_strata=1, **kwargs)

Aggregate and merge compartments. This is the primary entry to this class.

Parameters:

Name Type Description Default
compute_subsample bool

Whether or not to compute subsample. compute_subsample must be specified to perform subsampling. The function aggregate_profiles(compute_subsample=True) will apply subsetting even if subsample is initialized.

False
output_file str

The name of a file to output. We recommend that, if provided, the output file be suffixed with "_augmented".

None
compression_options str

Compression arguments as input to pandas.to_csv() with pandas version >= 1.2.

None
float_format str

Decimal precision to use in writing output file.

None
n_aggregation_memory_strata int

Number of unique strata to pull from the database into working memory at once. Typically 1 is fastest. A larger number uses more memory.

1

Returns:

Type Description
DataFrame or str

if output_file=None returns a Pandas dataframe; else will write to file and return the filepath of the file

Source code in pycytominer/cyto_utils/cells.py
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
def aggregate_profiles(
    self,
    compute_subsample=False,
    output_file=None,
    compression_options=None,
    float_format=None,
    n_aggregation_memory_strata=1,
    **kwargs,
):
    """Aggregate and merge compartments. This is the primary entry to this class.

    Parameters
    ----------
    compute_subsample : bool, default False
        Whether or not to compute subsample. compute_subsample must be specified to perform subsampling.
        The function aggregate_profiles(compute_subsample=True) will apply subsetting even if subsample is initialized.
    output_file : str, optional
        The name of a file to output. We recommend that, if provided, the output file be suffixed with "_augmented".
    compression_options : str, optional
        Compression arguments as input to pandas.to_csv() with pandas version >= 1.2.
    float_format : str, optional
        Decimal precision to use in writing output file.
    n_aggregation_memory_strata : int, default 1
        Number of unique strata to pull from the database into working memory
        at once.  Typically 1 is fastest.  A larger number uses more memory.

    Returns
    -------
    pandas.core.frame.DataFrame or str
        if output_file=None returns a Pandas dataframe
        else will write to file and return the filepath of the file
    """
    if output_file is not None:
        self.set_output_file(output_file)

    # Aggregate each compartment in turn. The first compartment also carries
    # object counts and (optionally) image features; every later compartment
    # is merged onto it by the shared strata columns.
    for compartment_idx, compartment in enumerate(self.compartments):
        if compartment_idx == 0:
            aggregated = self.aggregate_compartment(
                compartment=compartment,
                compute_subsample=compute_subsample,
                compute_counts=True,
                add_image_features=self.add_image_features,
                n_aggregation_memory_strata=n_aggregation_memory_strata,
            )
        else:
            aggregated = aggregated.merge(
                self.aggregate_compartment(
                    compartment=compartment,
                    n_aggregation_memory_strata=n_aggregation_memory_strata,
                ),
                on=self.strata,
                how="inner",
            )

    self.is_aggregated = True

    # self.output_file may have been set earlier (e.g. in __init__), so check
    # the attribute rather than the local output_file argument
    if self.output_file is not None:
        return output(
            df=aggregated,
            output_filename=self.output_file,
            compression_options=compression_options,
            float_format=float_format,
            **kwargs,
        )
    else:
        return aggregated

count_cells(compartment='cells', count_subset=False)

Determine how many cells are measured per well.

Parameters:

Name Type Description Default
compartment str

Compartment to subset.

"cells"
count_subset bool

Whether or not to count the number of cells as specified by the strata groups.

False

Returns:

Type Description
DataFrame

DataFrame of cell counts in the experiment.

Source code in pycytominer/cyto_utils/cells.py
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
def count_cells(self, compartment="cells", count_subset=False):
    """Determine how many cells are measured per well.

    Parameters
    ----------
    compartment : str, default "cells"
        Compartment to subset.
    count_subset : bool, default False
        Whether or not to count the number of cells as specified by the strata groups.

    Returns
    -------
    pandas.core.frame.DataFrame
        DataFrame of cell counts in the experiment.
    """
    check_compartments(compartment)

    if count_subset:
        # Counting a subset requires that aggregation and subsampling ran first
        assert self.is_aggregated, "Make sure to aggregate_profiles() first!"  # noqa: S101
        assert self.is_subset_computed, "Make sure to get_subsample() first!"  # noqa: S101
        per_stratum = self.subset_data_df.groupby(self.strata)[
            "Metadata_ObjectNumber"
        ]
        return (
            per_stratum.count()
            .reset_index()
            .rename({"Metadata_ObjectNumber": "cell_count"}, axis="columns")
        )

    # Otherwise count every object row that links to an image row
    query_cols = "TableNumber, ImageNumber, ObjectNumber"
    query = f"select {query_cols} from {compartment}"
    merged = self.image_df.merge(
        pd.read_sql(sql=query, con=self.conn), how="inner", on=self.merge_cols
    )
    per_stratum = merged.groupby(self.strata)["ObjectNumber"]
    return (
        per_stratum.count()
        .reset_index()
        .rename({"ObjectNumber": "cell_count"}, axis="columns")
    )

count_sql_table_rows(table)

Count total number of rows for a table.

Source code in pycytominer/cyto_utils/cells.py
414
415
416
417
def count_sql_table_rows(self, table):
    """Count total number of rows for a table."""
    # COUNT(*) yields exactly one single-column row; unpack it strictly
    result_row = next(self.conn.execute(f"SELECT COUNT(*) FROM {table}"))
    (total,) = result_row
    return total

get_sql_table_col_names(table)

Get column names from the database.

Source code in pycytominer/cyto_utils/cells.py
419
420
421
422
423
424
def get_sql_table_col_names(self, table):
    """Get column names from the database."""
    # The DBAPI cursor's description is a sequence with one entry per column;
    # the first element of each entry is the column name.
    cursor = self.conn.execute(f"SELECT * FROM {table} LIMIT 1").cursor
    return [entry[0] for entry in cursor.description]

get_subsample(df=None, compartment='cells', rename_col=True)

Apply the subsampling procedure.

Parameters:

Name Type Description Default
df DataFrame

DataFrame of a single cell profile.

None
compartment str

The compartment to process.

"cells"
rename_col bool

Whether or not to rename the columns.

True

Returns:

Type Description
None

Nothing is returned.

Source code in pycytominer/cyto_utils/cells.py
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
def get_subsample(self, df=None, compartment="cells", rename_col=True):
    """Apply the subsampling procedure.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        DataFrame of a single cell profile.
    compartment : str, default "cells"
        The compartment to process.
    rename_col : bool, default True
        Whether or not to rename the columns.

    Returns
    -------
    None
        Nothing is returned.
    """
    check_compartments(compartment)

    # When no dataframe is supplied, pull the linking columns straight from
    # the compartment table
    if df is None:
        query_cols = "TableNumber, ImageNumber, ObjectNumber"
        df = pd.read_sql(
            sql=f"select {query_cols} from {compartment}", con=self.conn
        )

    # Attach image-level metadata, then subsample within each stratum
    joined = self.image_df.merge(df, how="inner", on=self.merge_cols)
    self.subset_data_df = (
        joined.groupby(self.strata)
        .apply(lambda group: self.subsample_profiles(group, rename_col=rename_col))
        .reset_index(drop=True)
    )

    self.is_subset_computed = True

load_compartment(compartment)

Create the compartment dataframe.

Note: makes use of default_datatype_float attribute for setting a default floating point datatype.

Parameters:

Name Type Description Default
compartment str

The compartment to process.

required

Returns:

Type Description
DataFrame

Compartment dataframe.

Source code in pycytominer/cyto_utils/cells.py
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
def load_compartment(self, compartment):
    """Create the compartment dataframe.

    Note: makes use of default_datatype_float attribute
    for setting a default floating point datatype.

    Parameters
    ----------
    compartment : str
        The compartment to process.

    Returns
    -------
    pandas.core.frame.DataFrame
        Compartment dataframe.
    """
    # Get data useful to pre-alloc memory
    num_cells = self.count_sql_table_rows(compartment)
    col_names = self.get_sql_table_col_names(compartment)
    if self.features != "infer":  # allow to get only some features
        col_names = [x for x in col_names if x in self.features]
    meta_cols, feat_cols = self.split_column_categories(col_names)
    num_meta, num_feats = len(meta_cols), len(feat_cols)

    # Use pre-allocated np.array for feature data
    feats = np.empty(
        shape=(num_cells, num_feats), dtype=self.default_datatype_float
    )
    # Use pre-allocated pd.DataFrame for metadata
    metas = pd.DataFrame(columns=meta_cols, index=range(num_cells))

    # Query database for selected columns of chosen compartment
    columns = ", ".join(meta_cols + feat_cols)
    query = f"select {columns} from {compartment}"
    query_result = self.conn.execute(query)

    # Load data row by row for both meta information and features
    # NOTE(review): relies on the SELECT returning columns in
    # meta_cols + feat_cols order, and on every feature value being castable
    # to default_datatype_float; non-numeric feature values would raise here.
    for i, row in enumerate(query_result):
        metas.loc[i] = row[:num_meta]
        feats[i] = row[num_meta:]

    # Return concatenated data and metainformation of compartment
    return pd.concat([metas, pd.DataFrame(columns=feat_cols, data=feats)], axis=1)

load_image(image_table_name=None)

Load image table from sqlite file.

Returns:

Type Description
None

Nothing is returned.

Source code in pycytominer/cyto_utils/cells.py
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
def load_image(self, image_table_name=None):
    """Read the image table from the sqlite backend into ``self.image_df``.

    Parameters
    ----------
    image_table_name : str, optional
        Name of the image table to read; defaults to ``self.image_table_name``.

    Returns
    -------
    None
        Nothing is returned.
    """
    # Fall back to the table name configured on the instance
    if image_table_name is None:
        image_table_name = self.image_table_name

    # Pull the full image table from the database
    self.image_df = pd.read_sql(
        sql=f"select * from {image_table_name}", con=self.conn
    )

    # Optionally derive image-level features before the columns are trimmed
    if self.add_image_features:
        self.image_features_df = extract_image_features(
            self.image_feature_categories,
            self.image_df,
            self.image_cols,
            self.strata,
        )

    # Keep only the image and strata columns needed downstream
    keep_cols = list(np.union1d(self.image_cols, self.strata))
    self.image_df = self.image_df[keep_cols]

    # Restrict to the requested fields of view, if a subset was given
    if self.fields_of_view != "all":
        check_fields_of_view(
            list(np.unique(self.image_df[self.fields_of_view_feature])),
            list(self.fields_of_view),
        )
        fov_filter = f"{self.fields_of_view_feature}==@self.fields_of_view"
        self.image_df = self.image_df.query(fov_filter)

        if self.add_image_features:
            self.image_features_df = self.image_features_df.query(fov_filter)

    self.image_data_loaded = True

merge_single_cells(compute_subsample=False, sc_output_file=None, compression_options=None, float_format=None, single_cell_normalize=False, normalize_args=None, platemap=None, **kwargs)

Given the linking columns, merge single cell data. Normalization is also supported.

Parameters:

Name Type Description Default
compute_subsample bool

Whether or not to compute subsample.

False
sc_output_file str

The name of a file to output.

None
compression_options str

Compression arguments as input to pandas.to_csv() with pandas version >= 1.2.

None
float_format str

Decimal precision to use in writing output file.

None
single_cell_normalize bool

Whether or not to normalize the single cell data.

False
normalize_args dict

Additional arguments passed as input to pycytominer.normalize().

None
platemap Optional[Union[str, DataFrame]]

optional platemap filepath str or pd.DataFrame to be used with results via annotate

None

Returns:

Type Description
DataFrame or str

if output_file=None returns a Pandas dataframe else will write to file and return the filepath of the file

Source code in pycytominer/cyto_utils/cells.py
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
def merge_single_cells(
    self,
    compute_subsample: bool = False,
    sc_output_file: Optional[str] = None,
    compression_options: Optional[str] = None,
    float_format: Optional[str] = None,
    single_cell_normalize: bool = False,
    normalize_args: Optional[Dict] = None,
    platemap: Optional[Union[str, pd.DataFrame]] = None,
    **kwargs,
):
    """Given the linking columns, merge single cell data. Normalization is also supported.

    Parameters
    ----------
    compute_subsample : bool, default False
        Whether or not to compute subsample.
    sc_output_file : str, optional
        The name of a file to output.
    compression_options : str, optional
        Compression arguments as input to pandas.to_csv() with pandas version >= 1.2.
    float_format : str, optional
        Decimal precision to use in writing output file.
    single_cell_normalize : bool, default False
        Whether or not to normalize the single cell data.
    normalize_args : dict, optional
        Additional arguments passed as input to pycytominer.normalize().
    platemap: str or pd.DataFrame, default None
        optional platemap filepath str or pd.DataFrame to be used with results via annotate
    **kwargs
        Forwarded to pycytominer.annotate() (when platemap is given) and to
        the output() writer (when sc_output_file is given).

    Returns
    -------
    pandas.core.frame.DataFrame or str
        if output_file=None returns a Pandas dataframe
        else will write to file and return the filepath of the file
    """
    # Load the single cell dataframe by merging on the specific linking columns
    # NOTE: the empty string acts as a sentinel meaning "no compartment
    # loaded yet"; it is replaced by a DataFrame on the first loop pass.
    sc_df = ""
    linking_check_cols = []
    merge_suffix_rename = []
    for left_compartment in self.compartment_linking_cols:
        for right_compartment in self.compartment_linking_cols[left_compartment]:
            # Make sure only one merge per combination occurs
            linking_check = "-".join(sorted([left_compartment, right_compartment]))
            if linking_check in linking_check_cols:
                continue

            # Specify how to indicate merge suffixes
            merge_suffix = [
                f"_{left_compartment}",
                f"_{right_compartment}",
            ]
            merge_suffix_rename += merge_suffix
            # Columns linking left -> right and right -> left, respectively
            left_link_col = self.compartment_linking_cols[left_compartment][
                right_compartment
            ]
            right_link_col = self.compartment_linking_cols[right_compartment][
                left_compartment
            ]

            # First pass only: materialize the left compartment (and
            # optionally subsample it) before merging in the right one.
            if isinstance(sc_df, str):
                sc_df = self.load_compartment(compartment=left_compartment)

                if compute_subsample:
                    # Sample cells proportionally by self.strata
                    self.get_subsample(df=sc_df, rename_col=False)

                    # Drop image-level columns so the left-merge keys below
                    # are only the subsampled cell identifiers.
                    subset_logic_df = self.subset_data_df.drop(
                        self.image_df.columns, axis="columns"
                    )

                    sc_df = subset_logic_df.merge(
                        sc_df, how="left", on=subset_logic_df.columns.tolist()
                    ).reindex(sc_df.columns, axis="columns")

            sc_df = sc_df.merge(
                self.load_compartment(compartment=right_compartment),
                left_on=[*self.merge_cols, left_link_col],
                right_on=[*self.merge_cols, right_link_col],
                suffixes=merge_suffix,
            )

            linking_check_cols.append(linking_check)

    # Add metadata prefix to merged suffixes
    full_merge_suffix_rename = []
    full_merge_suffix_original = []
    for col_name in self.merge_cols + list(self.linking_col_rename.keys()):
        full_merge_suffix_original.append(col_name)
        full_merge_suffix_rename.append(f"Metadata_{col_name}")

    # Also cover the suffixed column variants produced by the merges above
    for col_name in self.merge_cols + list(self.linking_col_rename.keys()):
        for suffix in set(merge_suffix_rename):
            full_merge_suffix_original.append(f"{col_name}{suffix}")
            full_merge_suffix_rename.append(f"Metadata_{col_name}{suffix}")

    self.full_merge_suffix_rename = dict(
        zip(full_merge_suffix_original, full_merge_suffix_rename)
    )

    # Add image data to single cell dataframe
    if not self.image_data_loaded:
        self.load_image(image_table_name=self.image_table_name)

    sc_df = (
        self.image_df.merge(sc_df, on=self.merge_cols, how="right")
        # pandas rename performance may be improved using copy=False, inplace=False
        # reference: https://ryanlstevens.github.io/2022-05-06-pandasColumnRenaming/
        .rename(self.linking_col_rename, axis="columns", copy=False, inplace=False)
        .rename(
            self.full_merge_suffix_rename, axis="columns", copy=False, inplace=False
        )
    )
    if single_cell_normalize:
        # Infering features is tricky with non-canonical data
        if normalize_args is None:
            normalize_args = {}
            features = infer_cp_features(sc_df, compartments=self.compartments)
        elif ("features" not in normalize_args) or (
            normalize_args["features"] == "infer"
        ):
            features = infer_cp_features(sc_df, compartments=self.compartments)
        else:
            features = normalize_args["features"]

        normalize_args["features"] = features

        sc_df = normalize(profiles=sc_df, **normalize_args)

    # In case platemap metadata is provided, use pycytominer.annotate for metadata
    if platemap is not None:
        sc_df = annotate(
            profiles=sc_df, platemap=platemap, output_file=None, **kwargs
        )

    # if output argument is provided, call it using df_merged_sc and kwargs
    if sc_output_file is not None:
        return output(
            df=sc_df,
            output_filename=sc_output_file,
            compression_options=compression_options,
            float_format=float_format,
            **kwargs,
        )
    else:
        return sc_df

set_output_file(output_file)

Set or modify output file.

Parameters:

Name Type Description Default
output_file str

New output file name.

required

Returns:

Type Description
None

Nothing is returned.

Source code in pycytominer/cyto_utils/cells.py
195
196
197
198
199
200
201
202
203
204
205
206
207
208
def set_output_file(self, output_file):
    """Update the file path that profiles are written to.

    Parameters
    ----------
    output_file : str
        New output file name.

    Returns
    -------
    None
        Nothing is returned.
    """
    # Rebind the attribute; no validation is performed here
    self.output_file = output_file

set_subsample_frac(subsample_frac)

Set or update the subsample fraction.

Parameters:

Name Type Description Default
subsample_frac float

Percentage of single cells to select (0 < subsample_frac <= 1).

1

Returns:

Type Description
None

Nothing is returned.

Source code in pycytominer/cyto_utils/cells.py
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
def set_subsample_frac(self, subsample_frac):
    """Update the fraction of single cells that will be sampled.

    Parameters
    ----------
    subsample_frac : float, default 1
        Percentage of single cells to select (0 < subsample_frac <= 1).

    Returns
    -------
    None
        Nothing is returned.
    """
    self.subsample_frac = subsample_frac
    # Re-validate the combined subsampling settings after the change
    self._check_subsampling()

set_subsample_n(subsample_n)

Set or update the subsample n.

Parameters:

Name Type Description Default
subsample_n int

Indicate how many samples to subsample - do not specify both subsample_frac and subsample_n.

"all"

Returns:

Type Description
None

Nothing is returned.

Source code in pycytominer/cyto_utils/cells.py
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
def set_subsample_n(self, subsample_n):
    """Set or update the subsample n.

    Parameters
    ----------
    subsample_n : int, default "all"
        Indicate how many samples to subsample - do not specify both subsample_frac and subsample_n.

    Returns
    -------
    None
        Nothing is returned.

    Raises
    ------
    ValueError
        If ``subsample_n`` cannot be coerced to an integer.
    """
    try:
        self.subsample_n = int(subsample_n)
    except (ValueError, TypeError) as err:
        # Chain the original error for easier debugging; TypeError is also
        # caught so non-numeric inputs (e.g. None) raise the documented
        # ValueError instead of leaking an unrelated TypeError.
        raise ValueError("subsample n must be an integer or coercable") from err
    self._check_subsampling()

set_subsample_random_state(random_state)

Set or update the subsample random state.

Parameters:

Name Type Description Default
random_state

The random state to init subsample.

required

Returns:

Type Description
None

Nothing is returned.

Source code in pycytominer/cyto_utils/cells.py
245
246
247
248
249
250
251
252
253
254
255
256
257
258
def set_subsample_random_state(self, random_state):
    """Store the random seed used when drawing subsamples.

    Parameters
    ----------
    random_state: int, optional
        The random state to init subsample.

    Returns
    -------
    None
        Nothing is returned.
    """
    self.subsampling_random_state = random_state

split_column_categories(col_names)

Split a list of column names into feature and metadata columns lists.

Source code in pycytominer/cyto_utils/cells.py
426
427
428
429
430
431
432
433
434
435
436
def split_column_categories(self, col_names):
    """Partition column names into metadata and feature column lists."""
    # Feature columns are prefixed (case-insensitively) by a compartment
    # name; every other column is treated as metadata.
    compartment_prefixes = tuple(self.compartments)
    feature_columns = [
        col for col in col_names if col.lower().startswith(compartment_prefixes)
    ]
    metadata_columns = [
        col for col in col_names if not col.lower().startswith(compartment_prefixes)
    ]
    return metadata_columns, feature_columns

subsample_profiles(df, rename_col=True)

Sample a Pandas DataFrame given subsampling information.

Parameters:

Name Type Description Default
df DataFrame

DataFrame of a single cell profile.

required
rename_col bool

Whether or not to rename the columns.

True

Returns:

Type Description
DataFrame

A subsampled pandas dataframe of single cell profiles.

Source code in pycytominer/cyto_utils/cells.py
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
def subsample_profiles(self, df, rename_col=True):
    """Draw a subsample of single cell profiles from ``df``.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        DataFrame of a single cell profile.
    rename_col : bool, default True
        Whether or not to rename the columns.

    Returns
    -------
    pandas.core.frame.DataFrame
        A subsampled pandas dataframe of single cell profiles.
    """
    # Lazily choose a seed so that repeated draws stay reproducible
    if self.subsampling_random_state is None:
        self.set_subsample_random_state(np.random.randint(0, 10000, size=1)[0])

    # A fraction of exactly 1 signals count-based sampling via subsample_n
    if self.subsample_frac == 1:
        sampled = df.sample(
            n=self.subsample_n,
            replace=True,
            random_state=self.subsampling_random_state,
        )
    else:
        sampled = df.sample(
            frac=self.subsample_frac,
            random_state=self.subsampling_random_state,
        )

    if rename_col:
        sampled = sampled.rename(self.linking_col_rename, axis="columns")

    return sampled

pycytominer.cyto_utils.collate

Module that provides functions for collating CellProfiler-created CSVs into a single SQLite file.

collate(batch, config, plate, base_directory='../..', column=None, munge=False, csv_dir='analysis', aws_remote=None, aggregate_only=False, tmp_dir='/tmp', overwrite=False, add_image_features=True, image_feature_categories=['Granularity', 'Texture', 'ImageQuality', 'Threshold'], printtoscreen=True)

Collate the CellProfiler-created CSVs into a single SQLite file by calling cytominer-database.

Parameters:

Name Type Description Default
batch str

Batch name to process

required
config str

Config file to pass to cytominer-database

required
plate str

Plate name to process

required
base_directory str

Base directory for subdirectories containing CSVs, backends, etc; in our preferred structure, this is the "workspace" directory

"../.."
column str

An existing column to be explicitly copied to a new column called Metadata_Plate if no Metadata_Plate column already explicitly exists

None
munge bool

Whether munge should be passed to cytominer-database, if True cytominer-database will expect a single all-object CSV; it will split each object into its own table

False
csv_dir str

The directory under the base directory where the analysis CSVs will be found. If running the analysis pipeline, this should nearly always be "analysis"

'analysis'
aws_remote str

A remote AWS prefix, if set CSV files will be synced down from at the beginning and to which SQLite files will be synced up at the end of the run

None
aggregate_only bool

Whether to perform only the aggregation of existent SQLite files and bypass previous collation steps

False
tmp_dir

The temporary directory to be used by cytominer-databases for output

'/tmp'
overwrite

Whether or not to overwrite an sqlite that exists in the temporary directory if it already exists

False
add_image_features

Whether or not to add the image features to the profiles

True
image_feature_categories

The list of image feature groups to be used by add_image_features during aggregation

['Granularity', 'Texture', 'ImageQuality', 'Threshold']
printtoscreen

Whether or not to print output to the terminal

True
Source code in pycytominer/cyto_utils/collate.py
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
def collate(
    batch,
    config,
    plate,
    base_directory="../..",
    column=None,
    munge=False,
    csv_dir="analysis",
    aws_remote=None,
    aggregate_only=False,
    tmp_dir="/tmp",  # noqa: S108
    overwrite=False,
    add_image_features=True,
    image_feature_categories=["Granularity", "Texture", "ImageQuality", "Threshold"],
    printtoscreen=True,
):
    """Collate the CellProfiler-created CSVs into a single SQLite file by calling cytominer-database.

    Parameters
    ----------
    batch : str
        Batch name to process
    config : str
        Config file to pass to cytominer-database
    plate : str
        Plate name to process
    base_directory : str, default "../.."
        Base directory for subdirectories containing CSVs, backends, etc; in our preferred structure, this is the "workspace" directory
    column : str, optional, default None
        An existing column to be explicitly copied to a new column called Metadata_Plate if no Metadata_Plate column already explicitly exists
    munge : bool, default False
        Whether munge should be passed to cytominer-database, if True cytominer-database will expect a single all-object CSV; it will split each object into its own table
    csv_dir : str, default 'analysis'
        The directory under the base directory where the analysis CSVs will be found. If running the analysis pipeline, this should nearly always be "analysis"
    aws_remote : str, optional, default None
        A remote AWS prefix, if set CSV files will be synced down from at the beginning and to which SQLite files will be synced up at the end of the run
    aggregate_only : bool, default False
        Whether to perform only the aggregation of existent SQLite files and bypass previous collation steps
    tmp_dir: str, default '/tmp'
        The temporary directory to be used by cytominer-databases for output
    overwrite: bool, optional, default False
        Whether or not to overwrite an sqlite that exists in the temporary directory if it already exists
    add_image_features: bool, optional, default True
        Whether or not to add the image features to the profiles
    image_feature_categories: list, optional, default ['Granularity','Texture','ImageQuality','Threshold']
        The list of image feature groups to be used by add_image_features during aggregation
    printtoscreen: bool, optional, default True
        Whether or not to print output to the terminal
    """
    from pycytominer.cyto_utils.cells import SingleCells

    # Check if optional dependency cytominer-database is installed
    try:
        import cytominer_database.ingest
        import cytominer_database.munge
    except ImportError:
        raise ImportError(
            """Optional dependency cytominer-database is not installed.
            Please install the `collate` optional dependency group: e.g. `pip install pycytominer[collate]`
            """
        )

    # Set up directories (these need to be abspaths to keep from confusing makedirs later)
    input_dir = pathlib.Path(f"{base_directory}/analysis/{batch}/{plate}/{csv_dir}")
    backend_dir = pathlib.Path(f"{base_directory}/backend/{batch}/{plate}")
    cache_backend_dir = pathlib.Path(f"{tmp_dir}/backend/{batch}/{plate}")

    aggregated_file = pathlib.Path(f"{backend_dir}/{plate}.csv")
    backend_file = pathlib.Path(f"{backend_dir}/{plate}.sqlite")
    cache_backend_file = pathlib.Path(f"{cache_backend_dir}/{plate}.sqlite")

    if not aggregate_only:
        if os.path.exists(cache_backend_file):
            if not overwrite:
                sys.exit(
                    f"An SQLite file for {plate} already exists at {cache_backend_file} and overwrite is set to False. Terminating."
                )
            else:
                os.remove(cache_backend_file)

        for eachdir in [input_dir, backend_dir, cache_backend_dir]:
            if not os.path.exists(eachdir):
                os.makedirs(eachdir, exist_ok=True)

        if aws_remote:
            remote_input_dir = f"{aws_remote}/analysis/{batch}/{plate}/{csv_dir}"

            remote_backend_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.sqlite"

            remote_aggregated_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.csv"

            sync_cmd = f"aws s3 sync --exclude * --include */Cells.csv --include */Nuclei.csv --include */Cytoplasm.csv --include */Image.csv {remote_input_dir} {input_dir}"
            if printtoscreen:
                print(f"Downloading CSVs from {remote_input_dir} to {input_dir}")
            run_check_errors(sync_cmd)

        if printtoscreen:
            print(f"Ingesting {input_dir}")
        # Run cytominer-database ingest
        if munge:
            cytominer_database.munge.munge(config_path=config, source=input_dir)

        cytominer_database.ingest.seed(
            source=input_dir,
            target=f"sqlite:///{cache_backend_file}",
            config_file=config,
        )

        # Create a sqlite3 connection
        with sqlite3.connect(cache_backend_file, isolation_level=None) as connection:
            cursor = connection.cursor()
            if column:
                # BUGFIX: this previously tested `if print:` — the builtin
                # function, which is always truthy — so the message could
                # never be silenced via printtoscreen=False.
                if printtoscreen:
                    print(f"Adding a Metadata_Plate column based on column {column}")
                cursor.execute("ALTER TABLE Image ADD COLUMN Metadata_Plate TEXT;")
                cursor.execute(f"UPDATE image SET Metadata_Plate ={column};")

            if printtoscreen:
                print(f"Indexing database {cache_backend_file}")
            cursor.execute(
                "CREATE INDEX IF NOT EXISTS table_image_idx ON Image(TableNumber, ImageNumber);"
            )
            for eachcompartment in ["Cells", "Cytoplasm", "Nuclei"]:
                cursor.execute(
                    f"""CREATE INDEX IF NOT EXISTS table_image_object_{eachcompartment.lower()}_idx
                                ON {eachcompartment}(TableNumber, ImageNumber, ObjectNumber);"""
                )
            cursor.execute(
                "CREATE INDEX IF NOT EXISTS plate_well_image_idx ON Image(Metadata_Plate, Metadata_Well);"
            )
            cursor.close()
        connection.close()

        if aws_remote:
            if printtoscreen:
                print(f"Uploading {cache_backend_file} to {remote_backend_file}")
            cp_cmd = ["aws", "s3", "cp", cache_backend_file, remote_backend_file]
            run_check_errors(cp_cmd)

            if printtoscreen:
                print(
                    f"Removing analysis files from {input_dir} and {cache_backend_dir}"
                )
            import shutil

            shutil.rmtree(input_dir)

        if printtoscreen:
            print(f"Renaming {cache_backend_file} to {backend_file}")
        os.rename(cache_backend_file, backend_file)

    if printtoscreen:
        print(f"Aggregating sqlite:///{backend_file}")

    if aggregate_only and aws_remote:
        remote_backend_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.sqlite"

        remote_aggregated_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.csv"

        cp_cmd = ["aws", "s3", "cp", remote_backend_file, backend_file]
        if printtoscreen:
            print(
                f"Downloading SQLite files from {remote_backend_file} to {backend_file}"
            )
        run_check_errors(cp_cmd)

    if not os.path.exists(backend_file):
        sys.exit(f"{backend_file} does not exist. Exiting.")

    if add_image_features:
        pass
    else:
        image_feature_categories = None  # defensive but not sure what will happen if we give a list but set to False

    database = SingleCells(
        f"sqlite:///{backend_file}",
        aggregation_operation="mean",
        add_image_features=add_image_features,
        image_feature_categories=image_feature_categories,
    )
    database.aggregate_profiles(output_file=aggregated_file)

    if aws_remote:
        if printtoscreen:
            print(f"Uploading {aggregated_file} to {remote_aggregated_file}")
        csv_cp_cmd = ["aws", "s3", "cp", aggregated_file, remote_aggregated_file]
        run_check_errors(csv_cp_cmd)

        if printtoscreen:
            print(f"Removing backend files from {backend_dir}")
        import shutil

        shutil.rmtree(backend_dir)

run_check_errors(cmd)

Run a system command, and exit if an error occurred, otherwise continue.

Source code in pycytominer/cyto_utils/collate.py
10
11
12
13
14
15
16
17
18
19
20
def run_check_errors(cmd):
    """Run a system command, and exit if an error occurred, otherwise continue."""
    # Accept either a command string or an argv-style list
    argv = cmd.split() if isinstance(cmd, str) else cmd
    result = subprocess.run(argv, capture_output=True, text=True)  # noqa: S603
    # Any stderr output is treated as a fatal error
    if result.stderr != "":
        sys.exit(
            f"The error {result.stderr} was generated when running "
            f"{' '.join(map(str, argv))}. Exiting."
        )
    return

pycytominer.cyto_utils.collate_cmd

Command line interface for collate function in pycytominer.cyto_utils.collate.

pycytominer.cyto_utils.cp_image_features

Functions for counting the number of fields and aggregating other images features.

aggregate_fields_count(image_df, strata, fields_of_view_feature)

Compute the number of fields per well and create a new column called Metadata_Site_Count.

Parameters:

Name Type Description Default
image_df DataFrame

Image table dataframe which includes the strata and fields of view feature as columns.

required
strata list of str

The columns to groupby and aggregate single cells.

required
fields_of_view_feature

Name of the fields of view column.

required

Returns:

Name Type Description
fields_count_df DataFrame

DataFrame with the Metadata_Site_Count column.

Source code in pycytominer/cyto_utils/cp_image_features.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
def aggregate_fields_count(image_df, strata, fields_of_view_feature):
    """Count the fields of view per well, stored as Metadata_Site_Count.

    Parameters
    ----------
    image_df : pandas.core.frame.DataFrame
        Image table dataframe which includes the strata and fields of view feature as columns.
    strata :  list of str
        The columns to groupby and aggregate single cells.
    fields_of_view_feature: str
        Name of the fields of view column.

    Returns
    -------
    fields_count_df: pandas.core.frame.DataFrame
        DataFrame with the Metadata_Site_Count column.

    """
    # Only the grouping columns and the field-of-view column are needed
    wanted_cols = list(np.union1d(strata, fields_of_view_feature))
    per_group = (
        image_df.loc[:, wanted_cols].groupby(strata)[fields_of_view_feature].count()
    )

    # One row per well, with the count under a standardized column name
    return per_group.reset_index().rename(
        columns={fields_of_view_feature: "Metadata_Site_Count"}
    )

aggregate_image_count_features(df, image_features_df, image_cols, strata, count_prefix='Count')

Aggregate the Count features in the Image table.

Parameters:

Name Type Description Default
df DataFrame

Dataframe of aggregated profiles.

required
image_features_df DataFrame

Image table dataframe with Count features

required
image_cols list of str

Columns to select from the image table.

required
strata list of str

The columns to groupby and aggregate single cells.

required
count_prefix str

Prefix of the count columns in the image table.

"Count"

Returns:

Name Type Description
df DataFrame

DataFrame with aggregated Count features in the Image table.

remove_cols list of str

Columns to remove from the image table before aggregating using aggregate_image_features()

Source code in pycytominer/cyto_utils/cp_image_features.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def aggregate_image_count_features(
    df, image_features_df, image_cols, strata, count_prefix="Count"
):
    """Aggregate the Count features in the Image table.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        Dataframe of aggregated profiles.
    image_features_df : pandas.core.frame.DataFrame
        Image table dataframe with Count features
    image_cols : list of str
        Columns to select from the image table.
    strata :  list of str
        The columns to groupby and aggregate single cells.
    count_prefix : str, default "Count"
        Prefix of the count columns in the image table.

    Returns
    -------
    df : pandas.core.frame.DataFrame
        DataFrame with aggregated Count features in the Image table.
    remove_cols : list of str
        Columns to remove from the image table before aggregating using aggregate_image_features()
    """
    # Count columns carry a "Metadata_<prefix>" name in the image table
    count_mask = image_features_df.columns.str.startswith(
        "Metadata_" + str(count_prefix)
    )
    count_features = list(image_features_df.columns[count_mask])

    # These columns are consumed here and must be dropped downstream
    remove_cols = list(np.union1d(image_cols, count_features))

    # Sum the counts within each well and merge them onto the profiles
    keep_cols = list(np.union1d(strata, count_features))
    summed_counts = (
        image_features_df[keep_cols]
        .copy()
        .groupby(strata, dropna=False)
        .sum()
        .reset_index()
    )
    df = df.merge(summed_counts, on=strata, how="left")

    return df, remove_cols

aggregate_image_features(df, image_features_df, image_feature_categories, image_cols, strata, aggregation_operation, count_prefix='Count')

Aggregate the non-Count image features.

Parameters:

Name Type Description Default
df DataFrame

Dataframe of aggregated profiles.

required
image_features_df DataFrame

Image table dataframe with all the image_feature_category features.

required
image_feature_categories list of str

List of categories of features from the image table to add to the profiles.

required
image_cols list of str

Columns to select from the image table.

required
strata list of str

The columns to groupby and aggregate single cells.

required
aggregation_operation str

Operation to perform image table feature aggregation.

required
count_prefix str

Prefix of the count columns in the image table.

"Count"

Returns:

Name Type Description
df DataFrame

DataFrame of aggregated image features.

Source code in pycytominer/cyto_utils/cp_image_features.py
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
def aggregate_image_features(
    df,
    image_features_df,
    image_feature_categories,
    image_cols,
    strata,
    aggregation_operation,
    count_prefix="Count",
):
    """Aggregate the non-Count image features.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        Dataframe of aggregated profiles.
    image_features_df : pandas.core.frame.DataFrame
        Image table dataframe with all the image_feature_category features.
    image_feature_categories : list of str
        List of categories of features from the image table to add to the profiles.
    image_cols : list of str
        Columns to select from the image table.
    strata : list of str
        The columns to groupby and aggregate single cells.
    aggregation_operation : str
        Operation to perform image table feature aggregation.
    count_prefix : str, default "Count"
        Prefix of the count columns in the image table.

    Returns
    -------
    df : pandas.core.frame.DataFrame
        DataFrame of aggregated image features.
    """
    # Aggregate image count features
    if count_prefix in image_feature_categories:
        # Bug fix: forward count_prefix to the helper so a non-default prefix
        # is honored (previously the helper silently used its own default).
        df, remove_cols = aggregate_image_count_features(
            df, image_features_df, image_cols, strata, count_prefix
        )
    else:
        # Count aggregation not requested: still flag count columns for removal
        remove_cols = list(image_cols) + list(
            image_features_df.columns[
                image_features_df.columns.str.startswith(f"Metadata_{count_prefix}")
            ]
        )

    # Aggregate the remaining (non-count) image feature categories
    if len(np.setdiff1d(image_feature_categories, [count_prefix])) != 0:
        image_features_df = image_features_df.drop(
            remove_cols, axis="columns", errors="ignore"
        )
        features = list(np.setdiff1d(list(image_features_df.columns), strata))
        image_features_df = aggregate.aggregate(
            population_df=image_features_df,
            strata=strata,
            features=features,
            operation=aggregation_operation,
        )

        df = df.merge(image_features_df, on=strata, how="left")

    return df

pycytominer.cyto_utils.features

Utility function to manipulate cell profiler features.

convert_compartment_format_to_list(compartments)

Convert cell painting compartments to a list.

Parameters:

Name Type Description Default
compartments list of str or str

Cell Painting compartment(s).

required

Returns:

Name Type Description
compartments list of str

List of Cell Painting compartments.

Source code in pycytominer/cyto_utils/features.py
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
def convert_compartment_format_to_list(compartments):
    """Convert cell painting compartments to a list.

    Parameters
    ----------
    compartments : list of str or str
        Cell Painting compartment(s).

    Returns
    -------
    compartments : list of str
        List of Cell Painting compartments.
    """
    # A bare string becomes a single-element list; lists are lowercased
    # element-wise; any other type is passed through untouched.
    if isinstance(compartments, str):
        return [compartments.lower()]
    if isinstance(compartments, list):
        return [entry.lower() for entry in compartments]
    return compartments

count_na_features(population_df, features)

Given a population dataframe and features, count how many nas per feature.

Parameters:

Name Type Description Default
population_df DataFrame

DataFrame of profiles.

required
features list of str

Features present in the population dataframe.

required

Returns:

Type Description
Dataframe of NA counts per feature
Source code in pycytominer/cyto_utils/features.py
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
def count_na_features(population_df, features):
    """Given a population dataframe and features, count how many nas per feature.

    Parameters
    ----------
    population_df : pandas.core.frame.DataFrame
        DataFrame of profiles.
    features : list of str
        Features present in the population dataframe.

    Returns
    -------
    Dataframe of NA counts per feature
    """
    na_counts = population_df.loc[:, features].isna().sum()
    return pd.DataFrame({"num_na": na_counts})

drop_outlier_features(population_df, features='infer', samples='all', outlier_cutoff=500)

Exclude a feature if its min or max absolute value is greater than the threshold.

Parameters:

Name Type Description Default
population_df DataFrame

DataFrame that includes metadata and observation features.

required
features list of str or str

Features present in the population dataframe. If "infer", then assume Cell Painting features are those that start with "Cells_", "Nuclei_", or "Cytoplasm_"

"infer"
samples str

List of samples to perform operation on. The function uses a pd.DataFrame.query() function, so you should structure samples in this fashion. An example is "Metadata_treatment == 'control'" (include all quotes). If "all", use all samples to calculate.

"all"
outlier_cutoff int or float

see https://github.com/cytomining/pycytominer/issues/237 for details. Threshold to remove features if absolute values is greater

500

Returns:

Name Type Description
outlier_features list of str

Features greater than the threshold.

Source code in pycytominer/cyto_utils/features.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
def drop_outlier_features(
    population_df, features="infer", samples="all", outlier_cutoff=500
):
    """Exclude a feature if its min or max absolute value is greater than the threshold.

    Parameters
    ----------
    population_df : pandas.core.frame.DataFrame
        DataFrame that includes metadata and observation features.
    features : list of str or str, default "infer"
        Features present in the population dataframe. If "infer", then assume
        Cell Painting features are those that start with "Cells_", "Nuclei_",
        or "Cytoplasm_".
    samples : str, default "all"
        List of samples to perform operation on. The function uses a
        pd.DataFrame.query() function, so you should structure samples in this
        fashion. An example is "Metadata_treatment == 'control'" (include all
        quotes). If "all", use all samples to calculate.
    outlier_cutoff : int or float, default 500
        see https://github.com/cytomining/pycytominer/issues/237 for details.
        Threshold to remove features if absolute values is greater

    Returns
    -------
    outlier_features : list of str
        Features greater than the threshold.
    """
    # Bug fix: subset without mutating the caller's dataframe (the previous
    # implementation used query(..., inplace=True), silently altering input).
    if samples != "all":
        population_df = population_df.query(samples)

    if features == "infer":
        features = infer_cp_features(population_df)

    # Restrict to the requested feature columns (previously duplicated in
    # both branches of an if/else)
    population_df = population_df.loc[:, features]

    max_feature_values = population_df.max().abs()
    min_feature_values = population_df.min().abs()

    # A feature is an outlier if either extreme exceeds the cutoff in magnitude
    outlier_features = max_feature_values[
        (max_feature_values > outlier_cutoff) | (min_feature_values > outlier_cutoff)
    ].index.tolist()

    return outlier_features

get_blocklist_features(blocklist_file=blocklist_file, population_df=None)

Get a list of blocklist features.

Parameters:

Name Type Description Default
blocklist_file path-like object

Location of the dataframe with features to exclude.

blocklist_file
population_df DataFrame

Profile dataframe used to subset blocklist features.

None

Returns:

Name Type Description
blocklist_features list of str

Features to exclude from downstream analysis.

Source code in pycytominer/cyto_utils/features.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
def get_blocklist_features(blocklist_file=blocklist_file, population_df=None):
    """Get a list of blocklist features.

    Parameters
    ----------
    blocklist_file : path-like object
        Location of the dataframe with features to exclude.
    population_df : pandas.core.frame.DataFrame, optional
        Profile dataframe used to subset blocklist features.

    Returns
    -------
    blocklist_features : list of str
        Features to exclude from downstream analysis.
    """
    blocklist = pd.read_csv(blocklist_file)

    # The file must carry a column literally named "blocklist"
    assert (  # noqa: S101
        "blocklist" in blocklist.columns
    ), "one column must be named 'blocklist'"

    blocklist_features = blocklist["blocklist"].to_list()

    # Optionally keep only blocklist entries that appear in the profile columns
    if isinstance(population_df, pd.DataFrame):
        available = set(population_df.columns.tolist())
        blocklist_features = [
            feature for feature in blocklist_features if feature in available
        ]

    return blocklist_features

infer_cp_features(population_df, compartments=['Cells', 'Nuclei', 'Cytoplasm'], metadata=False, image_features=False)

Given a dataframe, output features that we expect to be Cell Painting features.

Parameters:

Name Type Description Default
population_df DataFrame

DataFrame from which features are to be inferred.

required
compartments list of str

Compartments from which Cell Painting features were extracted.

["Cells", "Nuclei", "Cytoplasm"]
metadata bool

Whether or not to infer metadata features.

False
image_features bool

Whether or not the profiles contain image features.

False

Returns:

Name Type Description
features list of str

List of Cell Painting features.

Source code in pycytominer/cyto_utils/features.py
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
def infer_cp_features(
    population_df,
    compartments=["Cells", "Nuclei", "Cytoplasm"],
    metadata=False,
    image_features=False,
):
    """Given a dataframe, output features that we expect to be Cell Painting features.

    Parameters
    ----------
    population_df : pandas.core.frame.DataFrame
        DataFrame from which features are to be inferred.
    compartments : list of str, default ["Cells", "Nuclei", "Cytoplasm"]
        Compartments from which Cell Painting features were extracted.
    metadata : bool, default False
        Whether or not to infer metadata features.
    image_features : bool, default False
        Whether or not the profiles contain image features.

    Returns
    -------
    features: list of str
        List of Cell Painting features.
    """
    # Normalize the compartment spec to Title-cased prefixes
    # (str.title is case-insensitive, so lowering first is unnecessary)
    if isinstance(compartments, str):
        compartments = [compartments]
    prefixes = [entry.title() for entry in compartments]

    if image_features:
        # De-duplicate while force-including the Image compartment
        prefixes = list({"Image", *prefixes})

    features = [
        column
        for column in population_df.columns.tolist()
        if any(column.startswith(prefix) for prefix in prefixes)
    ]

    # Metadata inference overrides compartment-based selection entirely
    if metadata:
        features = population_df.columns[
            population_df.columns.str.startswith("Metadata_")
        ].tolist()

    assert (  # noqa: S101
        len(features) > 0
    ), "No CP features found. Are you sure this dataframe is from CellProfiler?"

    return features

label_compartment(cp_features, compartment, metadata_cols)

Assign compartment label to each features as a prefix.

Parameters:

Name Type Description Default
cp_features list of str

All features being used.

required
compartment str

Measured compartment.

required
metadata_cols list

Columns that should be considered metadata.

required

Returns:

Name Type Description
cp_features list of str

Recoded column names with appropriate metadata and compartment labels.

Source code in pycytominer/cyto_utils/features.py
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
def label_compartment(cp_features, compartment, metadata_cols):
    """Assign compartment label to each features as a prefix.

    Parameters
    ----------
    cp_features : list of str
        All features being used.
    compartment : str
        Measured compartment.
    metadata_cols : list
        Columns that should be considered metadata.

    Returns
    -------
    cp_features: list of str
        Recoded column names with appropriate metadata and compartment labels.
    """
    # Bug fix: str has no .Title() method — use .title()
    compartment = compartment.title()
    # Bug fix: "Nuceli" typo made "nuclei" always fail validation
    avail_compartments = ["Cells", "Cytoplasm", "Nuclei", "Image", "Barcode"]

    assert (  # noqa: S101
        compartment in avail_compartments
    ), f"provide valid compartment. One of: {avail_compartments}"

    # Metadata columns get a Metadata_ prefix; everything else gets the
    # compartment name as its prefix
    cp_features = [
        f"Metadata_{x}" if x in metadata_cols else f"{compartment}_{x}"
        for x in cp_features
    ]

    return cp_features

pycytominer.cyto_utils.load

Module for loading data from various file formats.

infer_delim(file)

Sniff the delimiter in the given file.

Parameters:

Name Type Description Default
file str

File name

required
Return

the delimiter used in the dataframe (typically either tab or commas)

Source code in pycytominer/cyto_utils/load.py
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
def infer_delim(file: str):
    """
    Sniff the delimiter in the given file.

    Parameters
    ----------
    file : str
        File name

    Return
    ------
    the delimiter used in the dataframe (typically either tab or commas)
    """
    # Read the first line; if the file is not plain text, retry as gzip
    try:
        with open(file) as handle:
            first_line = handle.readline()
    except UnicodeDecodeError:
        with gzip.open(file, "r") as handle:
            first_line = handle.readline().decode()

    return csv.Sniffer().sniff(first_line).delimiter

is_path_a_parquet_file(file)

Check if the provided file path is a parquet file.

Identify parquet files by inspecting the file extensions. If the file does not end with parquet, this will return False, else True.

Parameters:

Name Type Description Default
file Union[str, PurePath]

path to parquet file

required

Returns:

Type Description
bool

Returns True if the file path contains .parquet, else it will return False

Raises:

Type Description
TypeError

Raised if a non str or non-path object is passed in the file parameter

FileNotFoundError

Raised if the provided path in the file does not exist

Source code in pycytominer/cyto_utils/load.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
def is_path_a_parquet_file(file: Union[str, pathlib.PurePath]) -> bool:
    """Check if the provided file path is a parquet file.

    Identify parquet files by inspecting the file extensions.
    If the file does not end with `parquet`, this will return False, else True.

    Parameters
    ----------
    file : Union[str, pathlib.PurePath]
        path to parquet file

    Returns
    -------
    bool
        Returns True if the file path contains `.parquet`, else it will return
        False

    Raises
    ------
    TypeError
        Raised if a non str or non-path object is passed in the `file` parameter
    FileNotFoundError
        Raised if the provided path in the `file` does not exist
    """
    # PurePath() raises TypeError for non-path-like input
    candidate = pathlib.PurePath(file)

    # NOTE(review): despite the docstring, a missing path is only reported via
    # print — the extension check still runs on the unresolved path. Preserved
    # as-is; confirm whether raising was intended.
    try:
        candidate = pathlib.Path(candidate).resolve(strict=True)
    except FileNotFoundError as e:
        print("load_profiles() didn't find the path.", e, sep="\n")

    return candidate.suffix.lower() == ".parquet"

load_npz_features(npz_file, fallback_feature_prefix='DP', metadata=True)

Load an npz file storing features and, sometimes, metadata.

The function will first search the .npz file for a metadata column called "Metadata_Model". If the field exists, the function uses this entry as the feature prefix. If it doesn't exist, use the fallback_feature_prefix.

If the npz file does not exist, this function returns an empty dataframe.

Parameters:

Name Type Description Default
npz_file str

file path to the compressed output (typically DeepProfiler output)

required
fallback_feature_prefix

a string to prefix all features [default: "DP"].

'DP'
Return

df : pandas.core.frame.DataFrame pandas DataFrame of profiles

Source code in pycytominer/cyto_utils/load.py
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
def load_npz_features(npz_file, fallback_feature_prefix="DP", metadata=True):
    """
    Load an npz file storing features and, sometimes, metadata.

    The function will first search the .npz file for a metadata column called
    "Metadata_Model". If the field exists, the function uses this entry as the
    feature prefix. If it doesn't exist, use the fallback_feature_prefix.

    If the npz file does not exist, this function returns an empty dataframe.

    Parameters
    ----------
    npz_file : str
        file path to the compressed output (typically DeepProfiler output)
    fallback_feature_prefix : str
        a string to prefix all features [default: "DP"].
    metadata : bool
        whether to attach any stored metadata to the returned dataframe

    Return
    ------
    df : pandas.core.frame.DataFrame
        pandas DataFrame of profiles
    """
    try:
        npz = np.load(npz_file, allow_pickle=True)
    except FileNotFoundError:
        return pd.DataFrame([])

    has_metadata = "metadata" in npz.files
    df = pd.DataFrame(npz["features"])

    if not metadata:
        return df

    metadata_df = None
    feature_prefix = fallback_feature_prefix
    if has_metadata:
        stored_metadata = npz["metadata"].item()
        metadata_df = pd.DataFrame(
            stored_metadata, index=range(0, df.shape[0]), dtype=str
        )
        metadata_df.columns = [
            x if x.startswith("Metadata_") else f"Metadata_{x}" for x in metadata_df
        ]

        # Prefer the model name recorded in the metadata as the feature prefix
        if "Metadata_Model" in metadata_df.columns:
            feature_prefix = metadata_df.Metadata_Model.unique()[0]

    # Prefix every raw feature column that is not already prefixed
    df.columns = [
        x if str(x).startswith(feature_prefix) else f"{feature_prefix}_{x}" for x in df
    ]

    # Attach metadata columns alongside the features
    if metadata_df is not None:
        df = metadata_df.merge(df, how="outer", left_index=True, right_index=True)

    return df

load_npz_locations(npz_file, location_x_col_index=0, location_y_col_index=1)

Load an npz file storing locations and, sometimes, metadata.

The function reads the "locations" array stored in the .npz file and returns the requested x and y coordinate columns.

If the npz file does not exist, this function returns an empty dataframe.

Parameters:

Name Type Description Default
npz_file str

file path to the compressed output (typically DeepProfiler output)

required
location_x_col_index

index of the x location column (which column in DP output has X coords)

0
location_y_col_index

index of the y location column (which column in DP output has Y coords)

1
Return

df : pandas.core.frame.DataFrame pandas DataFrame of profiles

Source code in pycytominer/cyto_utils/load.py
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
def load_npz_locations(npz_file, location_x_col_index=0, location_y_col_index=1):
    """
    Load an npz file storing single-cell locations.

    Reads the "locations" array from the .npz file and returns the requested
    x and y coordinate columns.

    If the npz file does not exist, this function returns an empty dataframe.

    Parameters
    ----------
    npz_file : str
        file path to the compressed output (typically DeepProfiler output)
    location_x_col_index : int
        index of the x location column (which column in DP output has X coords)
    location_y_col_index : int
        index of the y location column (which column in DP output has Y coords)

    Return
    ------
    df : pandas.core.frame.DataFrame
        pandas DataFrame of locations
    """
    try:
        npz = np.load(npz_file, allow_pickle=True)
    except FileNotFoundError:
        return pd.DataFrame([])

    locations = npz["locations"]
    available_cols = locations.shape[1]

    # Reject indices beyond the columns actually present in the array
    if location_x_col_index >= available_cols:
        raise IndexError("OutOfBounds indexing via location_x_col_index")
    if location_y_col_index >= available_cols:
        raise IndexError("OutOfBounds indexing via location_y_col_index")

    coords = pd.DataFrame(locations)[[location_x_col_index, location_y_col_index]]
    coords.columns = ["Location_Center_X", "Location_Center_Y"]
    return coords

load_platemap(platemap, add_metadata_id=True)

Unless a dataframe is provided, load the given platemap dataframe from path or string.

Parameters:

Name Type Description Default
platemap pandas dataframe

location or actual pandas dataframe of platemap file

required
add_metadata_id bool

boolean if "Metadata_" should be appended to all platemap columns

True
Return

platemap : pandas.core.frame.DataFrame pandas DataFrame of profiles

Source code in pycytominer/cyto_utils/load.py
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
def load_platemap(platemap, add_metadata_id=True):
    """
    Unless a dataframe is provided, load the given platemap dataframe from path or string.

    Parameters
    ----------
    platemap : pandas dataframe
        location or actual pandas dataframe of platemap file

    add_metadata_id : bool
        boolean if "Metadata_" should be appended to all platemap columns

    Return
    ------
    platemap : pandas.core.frame.DataFrame
        pandas DataFrame of profiles

    Raises
    ------
    FileNotFoundError
        Raised if the platemap path does not exist.
    """
    if not isinstance(platemap, pd.DataFrame):
        try:
            delim = infer_delim(platemap)
            platemap = pd.read_csv(platemap, sep=delim)
        except FileNotFoundError as err:
            # Chain the original error so the underlying cause is preserved
            # (previously raised without `from`, hiding the original traceback)
            raise FileNotFoundError(f"{platemap} platemap file not found") from err
    else:
        # Setting platemap to a copy to prevent column name changes from back-propagating
        platemap = platemap.copy()

    if add_metadata_id:
        platemap.columns = [
            f"Metadata_{x}" if not x.startswith("Metadata_") else x
            for x in platemap.columns
        ]
    return platemap

load_profiles(profiles)

Unless a dataframe is provided, load the given profile dataframe from path or string.

Parameters:

Name Type Description Default
profiles (str, Path, DataFrame)

file location or actual pandas dataframe of profiles

str
Return

pandas DataFrame of profiles

Raises:

Type Description
FileNotFoundError

Raised if the provided profile does not exists

Source code in pycytominer/cyto_utils/load.py
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
def load_profiles(profiles):
    """
    Unless a dataframe is provided, load the given profile dataframe from path or string.

    Parameters
    ----------
    profiles : {str, pathlib.Path, pandas.DataFrame}
        file location or actual pandas dataframe of profiles

    Return
    ------
    pandas DataFrame of profiles

    Raises
    ------
    FileNotFoundError
        Raised if the provided profile does not exists
    """
    # Guard clause: an in-memory dataframe is returned untouched
    if isinstance(profiles, pd.DataFrame):
        return profiles

    # Parquet files get a dedicated reader; everything else is delimited text
    if is_path_a_parquet_file(profiles):
        return pd.read_parquet(profiles, engine="pyarrow")

    return pd.read_csv(profiles, sep=infer_delim(profiles))

pycytominer.cyto_utils.modz

Module for performing a modified z score transformation.

modz(population_df, replicate_columns, features='infer', method='spearman', min_weight=0.01, precision=4)

Collapse replicates into a consensus signature using a weighted transformation.

Parameters:

Name Type Description Default
population_df DataFrame

DataFrame that includes metadata and observation features.

required
replicate_columns (str, list)

a string or list of column(s) in the population dataframe that indicate replicate level information

required
features list

List of features present in the population dataframe [default: "infer"] if "infer", then assume cell painting features are those that start with "Cells_", "Nuclei_", or "Cytoplasm_".

"infer"
method str

indicating which correlation metric to use.

"spearman"
min_weight float

the minimum correlation to clip all non-negative values lower to

0.01
precision int

how many significant digits to round weights to

4

Returns:

Name Type Description
modz_df DataFrame

Consensus signatures with metadata for all replicates in the given DataFrame

Source code in pycytominer/cyto_utils/modz.py
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
def modz(
    population_df,
    replicate_columns,
    features="infer",
    method="spearman",
    min_weight=0.01,
    precision=4,
):
    """Collapse replicates into a consensus signature using a weighted transformation.

    Parameters
    ----------
    population_df : pandas.core.frame.DataFrame
        DataFrame that includes metadata and observation features.
    replicate_columns : str, list
        a string or list of column(s) in the population dataframe that
        indicate replicate level information
    features : list, default "infer"
         List of features present in the population dataframe [default: "infer"]
         if "infer", then assume cell painting features are those that start with
         "Cells_", "Nuclei_", or "Cytoplasm_".
    method : str, default "spearman"
        indicating which correlation metric to use.
    min_weight : float, default 0.01
        the minimum correlation to clip all non-negative values lower to
    precision : int, default 4
        how many significant digits to round weights to

    Returns
    -------
    modz_df : pandas.core.frame.DataFrame
        Consensus signatures with metadata for all replicates in the given DataFrame

    Raises
    ------
    ValueError
        If replicate_columns is neither a list nor a string.
    """
    population_features = population_df.columns.tolist()
    assert_error = f"{replicate_columns} not in input dataframe"
    if isinstance(replicate_columns, list):
        assert all(x in population_features for x in replicate_columns), assert_error  # noqa: S101
    elif isinstance(replicate_columns, str):
        assert replicate_columns in population_features, assert_error  # noqa: S101
        replicate_columns = replicate_columns.split()
    else:
        # Bug fix: previously this *returned* the ValueError instance instead
        # of raising it, so invalid input never signaled an error.
        raise ValueError("replicate_columns must be a list or string")

    if features == "infer":
        features = infer_cp_features(population_df)

    # Keep only replicate metadata plus the features being collapsed
    subset_features = list(set(replicate_columns + features))
    population_df = population_df.loc[:, subset_features]

    # Apply the modified z transformation within each replicate group
    modz_df = (
        population_df.groupby(replicate_columns)
        .apply(
            lambda x: modz_base(
                x.loc[:, features],
                method=method,
                min_weight=min_weight,
                precision=precision,
            )
        )
        .reset_index()
    )

    return modz_df

modz_base(population_df, method='spearman', min_weight=0.01, precision=4)

Perform a modified z score transformation.

This code is modified from cmapPy. (see https://github.com/cytomining/pycytominer/issues/52). Note that this will apply the transformation to the FULL population_df. See modz() for replicate level procedures.

Parameters:

Name Type Description Default
population_df DataFrame

DataFrame that includes metadata and observation features.

required
method str

indicating which correlation metric to use.

"spearman"
min_weight float

the minimum correlation to clip all non-negative values lower to

0.01
precision int

how many significant digits to round weights to

4

Returns:

Name Type Description
modz_df DataFrame

modz transformed dataframe - a consensus signature of the input data weighted by replicate correlation

Source code in pycytominer/cyto_utils/modz.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def modz_base(population_df, method="spearman", min_weight=0.01, precision=4):
    """Perform a modified z score transformation.

    This code is modified from cmapPy.
    (see https://github.com/cytomining/pycytominer/issues/52). Note that this will
    apply the transformation to the FULL population_df.
    See modz() for replicate level procedures.

    Parameters
    ----------
    population_df : pandas.core.frame.DataFrame
        DataFrame that includes metadata and observation features.
    method : str, default "spearman"
        indicating which correlation metric to use.
    min_weight : float, default 0.01
        the minimum correlation to clip all non-negative values lower to
    precision : int, default 4
        how many significant digits to round weights to

    Returns
    -------
    modz_df : pandas.core.frame.DataFrame
        modz transformed dataframe - a consensus signature of the input data
        weighted by replicate correlation
    """
    assert population_df.shape[0] > 0, "population_df must include at least one sample"  # noqa: S101

    method = check_correlation_method(method=method)

    # Step 1: pairwise correlations between samples (samples become columns)
    population_df = population_df.transpose()
    cor_df, pair_df = get_pairwise_correlation(population_df, method=method)
    pair_df = pair_df.round(precision)

    # Step 2: derive sample weights.
    # Mask each sample's self-correlation before averaging; work on a writable
    # copy because np.fill_diagonal mutates in place.
    corr_values = cor_df.values.copy()
    np.fill_diagonal(corr_values, np.nan)
    cor_df = pd.DataFrame(data=corr_values, index=cor_df.index, columns=cor_df.columns)

    # Negative correlations contribute nothing; mean ignores the NaN diagonal
    avg_corr = cor_df.clip(lower=0).mean(axis=1)

    # Floor at min_weight, then normalize so weights sum to 1
    avg_corr = avg_corr.clip(lower=min_weight)
    weights = (avg_corr / sum(avg_corr)).round(precision)

    # Step 3: weighted consensus (a lone sample passes through unweighted)
    if population_df.shape[1] == 1:
        return population_df.sum(axis=1)
    return (population_df * weights).sum(axis=1)

pycytominer.cyto_utils.output

Utility function to compress output data.

check_compression_method(compression)

Ensure compression options are set properly.

Parameters:

Name Type Description Default
compression str

The category of compression options available

required

Returns:

Type Description
None

Asserts available options

Source code in pycytominer/cyto_utils/output.py
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
def check_compression_method(compression: str):
    """Validate that the requested compression method is supported.

    Parameters
    ----------
    compression : str
        The category of compression options available

    Returns
    -------
    None
        Asserts available options
    """
    # COMPRESS_OPTIONS is the module-level whitelist of supported methods
    is_supported = compression in COMPRESS_OPTIONS
    assert is_supported, (  # noqa: S101
        f"{compression} is not supported, select one of {COMPRESS_OPTIONS}"
    )

output(df, output_filename, output_type='csv', sep=',', float_format=None, compression_options={'method': 'gzip', 'mtime': 1}, **kwargs)

Given an output file and compression options, write file to disk.

Parameters:

Name Type Description Default
df pandas.core.frame.DataFrame

a pandas dataframe that will be written to file

required
output_filename str

location of file to write

required
output_type str

type of output file to create

"csv"
sep str

file delimiter

','
float_format str

Decimal precision to use in writing output file as input to pd.DataFrame.to_csv(float_format=float_format). For example, use "%.3g" for 3 decimal precision.

None
compression_options str or dict

Contains compression options as input to pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.

{"method": "gzip", "mtime": 1}

Returns:

Type Description
str

returns output_filename

Examples:

import pandas as pd from pycytominer.cyto_utils import output

data_df = pd.concat( [ pd.DataFrame( { "Metadata_Plate": "X", "Metadata_Well": "a", "Cells_x": [0.1, 0.3, 0.8], "Nuclei_y": [0.5, 0.3, 0.1], } ), pd.DataFrame( { "Metadata_Plate": "X", "Metadata_Well": "b", "Cells_x": [0.4, 0.2, -0.5], "Nuclei_y": [-0.8, 1.2, -0.5], } ), ] ).reset_index(drop=True)

output_file = "test.csv.gz" output( df=data_df, output_filename=output_file, sep=",", compression_options={"method": "gzip", "mtime": 1}, float_format=None, )

Source code in pycytominer/cyto_utils/output.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
def output(
    df: pd.DataFrame,
    output_filename: str,
    output_type: str = "csv",
    sep: str = ",",
    float_format: Optional[str] = None,
    compression_options: Union[str, Dict] = {"method": "gzip", "mtime": 1},
    **kwargs,
):
    """Write a profile dataframe to disk, applying the requested compression.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        a pandas dataframe that will be written to file
    output_filename : str
        location of file to write
    output_type : str, default "csv"
        type of output file to create ("csv" or "parquet")
    sep : str
        file delimiter (csv output only)
    float_format : str, default None
        Decimal precision to use in writing output file as input to
        pd.DataFrame.to_csv(float_format=float_format). For example, use "%.3g" for 3
        decimal precision.
    compression_options : str or dict, default {"method": "gzip", "mtime": 1}
        Contains compression options as input to
        pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.

    Returns
    -------
    str
        returns output_filename

    Examples
    --------
    import pandas as pd
    from pycytominer.cyto_utils import output

    data_df = pd.DataFrame(
        {
            "Metadata_Plate": "X",
            "Metadata_Well": "a",
            "Cells_x": [0.1, 0.3, 0.8],
            "Nuclei_y": [0.5, 0.3, 0.1],
        }
    )

    output(
        df=data_df,
        output_filename="test.csv.gz",
        sep=",",
        compression_options={"method": "gzip", "mtime": 1},
        float_format=None,
    )
    """
    if output_type == "csv":
        # Normalize the compression argument into the dict form to_csv expects
        csv_compression = set_compression_method(compression=compression_options)

        df.to_csv(
            path_or_buf=output_filename,
            sep=sep,
            index=False,
            float_format=float_format,
            compression=csv_compression,
        )

    elif output_type == "parquet":
        # note: compression options will be validated against pd.DataFrame.to_parquet options
        # raising errors and tested through Pandas, PyArrow, etc. as necessary.
        df.to_parquet(path=output_filename, compression="snappy")

    return output_filename

set_compression_method(compression)

Set the compression options.

Parameters:

Name Type Description Default
compression str or dict

Contains compression options as input to pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.

required

Returns:

Type Description
(compression, dict)

A formatted dictionary expected by output()

Source code in pycytominer/cyto_utils/output.py
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
def set_compression_method(compression: Union[str, Dict]):
    """Normalize compression options into the dict form expected by output().

    Parameters
    ----------
    compression : str or dict
        Contains compression options as input to
        pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.

    Returns
    -------
    compression, dict
        A formatted dictionary expected by output()
    """
    # Coerce the two shorthand forms (None and bare string) into a dict
    if compression is None:
        compression = {"method": None}
    elif isinstance(compression, str):
        compression = {"method": compression}

    check_compression_method(compression["method"])
    return compression

pycytominer.cyto_utils.single_cell_ingest_utils

Utility functions for single cell ingest.

assert_linking_cols_complete(linking_cols='default', compartments='default')

Confirm that the linking cols and compartments are compatible.

Parameters:

Name Type Description Default
linking_cols str or dict

Specify how to link objects

"default"
compartments str or list

Which compartments used in the experiment.

"default"

Returns:

Type Description
None

Asserts linking columns are appropriately defined

.. note::

assert_linking_cols_complete() does not check if columns are present

Source code in pycytominer/cyto_utils/single_cell_ingest_utils.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
def assert_linking_cols_complete(linking_cols="default", compartments="default"):
    """Confirm that the linking cols and compartments are compatible.

    Parameters
    ----------
    linking_cols : str or dict, default "default"
        Specify how to link objects
    compartments : str or list, default "default"
        Which compartments used in the experiment.

    Returns
    -------
    None
        Asserts linking columns are appropriately defined

    .. note::
        assert_linking_cols_complete() does not check if columns are present
    """
    if linking_cols == "default":
        linking_cols = get_default_linking_cols()

    if compartments == "default":
        compartments = get_default_compartments()

    comp_err = "compartment not found. Check the specified compartments"

    pair_tracker = []
    seen_compartments = []
    for left, right_map in linking_cols.items():
        seen_compartments.append(left)
        assert left in compartments, f"{left} {comp_err}"  # noqa: S101
        for right in right_map:
            seen_compartments.append(right)
            assert right in compartments, f"{right} {comp_err}"  # noqa: S101
            # Record the pairing order-independently so A-B and B-A collapse
            pair_tracker.append("-".join(sorted([left, right])))

    # Each linkage must be declared from both sides (exactly twice)
    for combo, count in Counter(pair_tracker).items():
        assert count == 2, f"Missing column identifier between {combo}"  # noqa: S101

    # Every compartment must appear somewhere in the linking_cols
    seen_compartments = sorted(set(seen_compartments))
    diff_column = set(compartments).difference(seen_compartments)
    assert seen_compartments == sorted(compartments), (  # noqa: S101
        f"All compartments must be specified in the linking_cols, {diff_column} is missing"
    )

get_default_linking_cols()

Define the standard experiment linking columns between tables.

Returns:

Type Description
(linking_cols, dict)

A dictionary mapping columns that links together CellProfiler objects

.. note::

every dictionary pair has a 1 to 1 correspondence (e.g. cytoplasm-cells and cells-cytoplasm both must exist)

Source code in pycytominer/cyto_utils/single_cell_ingest_utils.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
def get_default_linking_cols():
    """Define the standard experiment linking columns between tables.

    Returns
    -------
    linking_cols, dict
        A dictionary mapping columns that links together CellProfiler objects

    .. note::
        every dictionary pair has a 1 to 1 correspondence (e.g. cytoplasm-cells and cells-cytoplasm both must exist)
    """
    # Cytoplasm links outward to both parents; cells/nuclei link back via ObjectNumber
    cytoplasm_links = {
        "cells": "Cytoplasm_Parent_Cells",
        "nuclei": "Cytoplasm_Parent_Nuclei",
    }
    return {
        "cytoplasm": cytoplasm_links,
        "cells": {"cytoplasm": "ObjectNumber"},
        "nuclei": {"cytoplasm": "ObjectNumber"},
    }

provide_linking_cols_feature_name_update(linking_cols='default')

Output a dictionary to use to update pandas dataframe column names from linking cols in the Metadata.

Parameters:

Name Type Description Default
linking_cols str or dict

Specify how to link objects

"default"

Returns:

Type Description
(update_name, dict)

Dictionary of the linking column names to update after they are used

Source code in pycytominer/cyto_utils/single_cell_ingest_utils.py
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
def provide_linking_cols_feature_name_update(linking_cols="default"):
    """Output a dictionary to use to update pandas dataframe column names from linking cols in the Metadata.

    Parameters
    ----------
    linking_cols : str or dict, default "default"
        Specify how to link objects

    Returns
    -------
    update_name, dict
        Dictionary of the linking column names to update after they are used
    """
    if linking_cols == "default":
        linking_cols = get_default_linking_cols()

    # Every linking column gets a "Metadata_" prefix once it has served its purpose
    update_name = {
        link_col: f"Metadata_{link_col}"
        for compartment in linking_cols
        for link_col in linking_cols[compartment].values()
    }
    return update_name

pycytominer.cyto_utils.util

Miscellaneous utility functions.

check_aggregate_operation(operation)

Confirm that the input operation for aggregation is currently supported.

Parameters:

Name Type Description Default
operation str

Aggregation operation to provide.

required

Returns:

Type Description
str

Correctly formatted operation method.

Source code in pycytominer/cyto_utils/util.py
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
def check_aggregate_operation(operation):
    """Confirm that the input operation for aggregation is currently supported.

    Parameters
    ----------
    operation : str
        Aggregation operation to provide.

    Returns
    -------
    str
        Correctly formatted operation method.

    """
    # Normalize case before checking against the supported set
    operation = operation.lower()
    supported_ops = ["mean", "median"]
    assert operation in supported_ops, (  # noqa: S101
        f"operation {operation} not supported, select one of {supported_ops}"
    )
    return operation

check_compartments(compartments)

Check if the input compartments are noncanonical compartments.

Parameters:

Name Type Description Default
compartments list of str

Input compartments.

required

Returns:

Type Description
None

Nothing is returned.

Source code in pycytominer/cyto_utils/util.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
def check_compartments(compartments):
    """Check if the input compartments are noncanonical compartments.

    Parameters
    ----------
    compartments : list of str
        Input compartments.

    Returns
    -------
    None
        Nothing is returned.

    """
    canonical = get_default_compartments()
    compartment_list = convert_compartment_format_to_list(compartments)

    # Collect any compartment that is not one of the canonical three
    non_canonical_compartments = [
        compartment
        for compartment in compartment_list
        if compartment not in canonical
    ]

    if non_canonical_compartments:
        warn_str = "Non-canonical compartment detected: {x}".format(
            x=", ".join(non_canonical_compartments)
        )
        warnings.warn(warn_str)

check_consensus_operation(operation)

Confirm that the input operation for consensus is currently supported.

Parameters:

Name Type Description Default
operation

Consensus operation to provide.

required

Returns:

Type Description
str

Correctly formatted operation method.

Source code in pycytominer/cyto_utils/util.py
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
def check_consensus_operation(operation):
    """Confirm that the input operation for consensus is currently supported.

    Parameters
    ----------
    operation: str
        Consensus operation to provide.

    Returns
    -------
    str
        Correctly formatted operation method.

    """
    operation = operation.lower()
    consensus_only_ops = ["modz"]  # All aggregation operations are also supported
    try:
        # Aggregation operations (mean/median) are valid consensus operations too
        return check_aggregate_operation(operation)
    except AssertionError:
        assert operation in consensus_only_ops, (  # noqa: S101
            f"operation {operation} not supported, select one of {consensus_only_ops} or see aggregate.py"
        )
    return operation

check_correlation_method(method)

Confirm that the input method is currently supported.

Parameters:

Name Type Description Default
method str

The correlation metric to use.

required

Returns:

Type Description
str

Correctly formatted correlation method.

Source code in pycytominer/cyto_utils/util.py
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
def check_correlation_method(method):
    """Confirm that the input method is currently supported.

    Parameters
    ----------
    method : str
        The correlation metric to use.

    Returns
    -------
    str
        Correctly formatted correlation method.

    """
    # Normalize case so callers may pass e.g. "Pearson"
    method = method.lower()
    supported_methods = ["pearson", "spearman", "kendall"]
    assert method in supported_methods, (  # noqa: S101
        f"method {method} not supported, select one of {supported_methods}"
    )
    return method

check_fields_of_view(data_fields_of_view, input_fields_of_view)

Confirm that the input list of fields of view is a subset of the list of fields of view in the image table.

Parameters:

Name Type Description Default
data_fields_of_view list of int

Fields of view in the image table.

required
input_fields_of_view list of int

Input fields of view.

required

Returns:

Type Description
None

Nothing is returned.

Source code in pycytominer/cyto_utils/util.py
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
def check_fields_of_view(data_fields_of_view, input_fields_of_view):
    """Confirm that the input list of fields of view is a subset of the list of fields of view in the image table.

    Parameters
    ----------
    data_fields_of_view : list of int
        Fields of view in the image table.
    input_fields_of_view : list of int
        Input fields of view.

    Returns
    -------
    None
        Nothing is returned.

    Raises
    ------
    ValueError
        If any input field of view is absent from the image table.

    """
    # Fix: the original wrapped an `assert` in try/except to raise ValueError.
    # Asserts are stripped under `python -O`, which would silently disable this
    # validation. Check the condition directly instead.
    # np.intersect1d returns the sorted unique values present in both inputs;
    # if fewer than the requested fields, some requested fields are missing.
    overlap = np.intersect1d(data_fields_of_view, input_fields_of_view)
    if len(list(overlap)) != len(input_fields_of_view):
        raise ValueError(
            "Some of the input fields of view are not present in the image table."
        )

check_fields_of_view_format(fields_of_view)

Confirm that the input fields of view is valid.

Parameters:

Name Type Description Default
fields_of_view list of int

List of integer fields of view.

required

Returns:

Type Description
str or list of int

Correctly formatted fields_of_view variable.

Source code in pycytominer/cyto_utils/util.py
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
def check_fields_of_view_format(fields_of_view):
    """Confirm that the input fields of view is valid.

    Parameters
    ----------
    fields_of_view : list of int
        List of integer fields of view.

    Returns
    -------
    str or list of int
        Correctly formatted fields_of_view variable.

    """
    # "all" is the sentinel meaning every field of view; pass it through untouched
    if fields_of_view == "all":
        return fields_of_view

    if not isinstance(fields_of_view, list):
        raise TypeError(
            f"Variable of type list expected, however type {type(fields_of_view)} was passed."
        )

    if all(isinstance(entry, int) for entry in fields_of_view):
        return fields_of_view

    # Attempt to coerce non-int entries (e.g. numeric strings) to int
    try:
        return [int(entry) for entry in fields_of_view]
    except ValueError:
        raise TypeError(
            "Variables of type int expected, however some of the input fields of view are not integers."
        )

check_image_features(image_features, image_columns)

Confirm that the input list of image features are present in the image table.

Parameters:

Name Type Description Default
image_features

Input image features to extract from the image table.

required
image_columns

Columns in the image table

required

Returns:

Type Description
None

Nothing is returned.

Source code in pycytominer/cyto_utils/util.py
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
def check_image_features(image_features, image_columns):
    """Confirm that the input list of image features are present in the image table.

    Parameters
    ----------
    image_features: list of str
        Input image features to extract from the image table.
    image_columns: list of str
        Columns in the image table

    Returns
    -------
    None
        Nothing is returned.

    Raises
    ------
    ValueError
        If any requested image feature is not present in the image table.
    """
    if "Image" in {img_col.split("_")[0] for img_col in image_columns}:
        # Image has already been prepended to most, but not all, columns
        level = 1
        image_columns = [x for x in image_columns if "_" in x]
    else:
        level = 0

    # Fix 1: the original rebuilt this set inside the generator for every
    # feature (O(features * columns)); build it once instead.
    available_features = {img_col.split("_")[level] for img_col in image_columns}

    # Fix 2: the original used an `assert` wrapped in try/except; asserts are
    # stripped under `python -O`, silently disabling validation. Check directly.
    if not all(feature in available_features for feature in image_features):
        raise ValueError(
            "Some of the input image features are not present in the image table."
        )

extract_image_features(image_feature_categories, image_df, image_cols, strata)

Confirm that the input list of image features categories are present in the image table and then extract those features.

Parameters:

Name Type Description Default
image_feature_categories list of str

Input image feature groups to extract from the image table.

required
image_df DataFrame

Image dataframe.

required
image_cols list of str

Columns to select from the image table.

required
strata list of str

The columns to groupby and aggregate single cells.

required

Returns:

Name Type Description
image_features_df DataFrame

Dataframe with extracted image features.

image_feature_categories list of str

Correctly formatted image feature categories.

Source code in pycytominer/cyto_utils/util.py
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
def extract_image_features(image_feature_categories, image_df, image_cols, strata):
    """Confirm that the input list of image features categories are present in the image table and then extract those features.

    Parameters
    ----------
    image_feature_categories : list of str
        Input image feature groups to extract from the image table.
    image_df : pandas.core.frame.DataFrame
        Image dataframe.
    image_cols : list of str
        Columns to select from the image table.
    strata :  list of str
        The columns to groupby and aggregate single cells.

    Returns
    -------
    image_features_df : pandas.core.frame.DataFrame
        Dataframe with extracted image features, prefixed appropriately and
        joined with the image_cols and strata columns.

    """
    # Validate the requested feature groups against the image table columns
    check_image_features(image_feature_categories, list(image_df.columns))

    # Select every column whose name begins with one of the requested categories
    category_prefixes = tuple(image_feature_categories)
    selected_columns = list(
        image_df.columns[image_df.columns.str.startswith(category_prefixes)]
    )

    image_features_df = image_df[selected_columns]

    # Rename: Count_* columns become metadata; already-prefixed Image_* columns
    # stay as-is; everything else gets an Image_ prefix.
    renamed_columns = []
    for col in image_features_df.columns:
        if col.startswith("Count_"):
            renamed_columns.append(f"Metadata_{col}")
        elif col.startswith("Image_"):
            renamed_columns.append(col)
        else:
            renamed_columns.append(f"Image_{col}")
    image_features_df.columns = renamed_columns

    # Add image_cols and strata to the dataframe
    image_features_df = pd.concat(
        [image_df[list(np.union1d(image_cols, strata))], image_features_df], axis=1
    )

    return image_features_df

get_default_compartments()

Return default compartments.

Returns:

Type Description
list of str

Default compartments.

Source code in pycytominer/cyto_utils/util.py
16
17
18
19
20
21
22
23
24
25
def get_default_compartments():
    """Return the canonical CellProfiler compartment names.

    Returns
    -------
    list of str
        Default compartments.

    """
    default_compartments = ["cells", "cytoplasm", "nuclei"]
    return default_compartments

get_pairwise_correlation(population_df, method='pearson')

Given a population dataframe, calculate all pairwise correlations.

Parameters:

Name Type Description Default
population_df DataFrame

Includes metadata and observation features.

required
method str

Which correlation matrix to use to test cutoff.

"pearson"

Returns:

Type Description
list of str

Features to exclude from the population_df.

Source code in pycytominer/cyto_utils/util.py
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
def get_pairwise_correlation(population_df, method="pearson"):
    """Given a population dataframe, calculate all pairwise correlations.

    Parameters
    ----------
    population_df : pandas.core.frame.DataFrame
        Includes metadata and observation features.
    method : str, default "pearson"
        Which correlation matrix to use to test cutoff.

    Returns
    -------
    data_cor_df : pandas.core.frame.DataFrame
        Symmetrical correlation matrix between columns.
    pairwise_df : pandas.core.frame.DataFrame
        Long-format pairwise correlations with columns
        ["pair_a", "pair_b", "correlation"].

    """
    # Check that the input method is supported
    method = check_correlation_method(method)

    # np.corrcoef is faster than DataFrame.corr but cannot handle NaN/Inf,
    # and only computes Pearson; fall back to pandas otherwise.
    raw_values = population_df.values
    is_clean = not (np.any(np.isnan(raw_values)) or np.any(np.isinf(raw_values)))
    if method == "pearson" and is_clean:
        labels = population_df.columns
        data_cor_df = pd.DataFrame(
            np.corrcoef(population_df.transpose()), index=labels, columns=labels
        )
    else:
        data_cor_df = population_df.corr(method=method)

    # Mask everything except the strict lower triangle with NaN so that each
    # unordered pair appears exactly once in the long format below.
    lower_triangle_mask = np.tril(np.ones(data_cor_df.shape), k=-1).astype(bool)
    lower_tri_df = data_cor_df.where(lower_triangle_mask)

    # stack() drops the NaN entries, leaving one row per pair
    pairwise_df = lower_tri_df.stack().reset_index()
    pairwise_df.columns = ["pair_a", "pair_b", "correlation"]

    return data_cor_df, pairwise_df

load_known_metadata_dictionary(metadata_file=default_metadata_file)

Load previously known metadata columns per compartment from metadata text file.

Parameters:

Name Type Description Default
metadata_file str

File location of the metadata text file which should be a tab-separated file with two columns: ["compartment", "feature"]. If not provided, the default metadata file will be used.

default_metadata_file

Returns:

Type Description
dict

Compartment (keys) mappings to previously known metadata (values).

Source code in pycytominer/cyto_utils/util.py
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
def load_known_metadata_dictionary(metadata_file=default_metadata_file):
    """Load previously known metadata columns per compartment from metadata text file.

    Parameters
    ----------
    metadata_file : str, optional
        File location of the metadata text file which should be a tab-separated file with two columns: ["compartment", "feature"].
        If not provided, the default metadata file will be used.

    Returns
    -------
    dict
        Compartment (keys) mappings to previously known metadata (values).

    """
    metadata_dict = {}
    with open(metadata_file) as meta_fh:
        # Skip the header row
        next(meta_fh)
        for line in meta_fh:
            compartment, feature = line.strip().split("\t")
            metadata_dict.setdefault(compartment.lower(), []).append(feature)

    return metadata_dict

pycytominer.cyto_utils.write_gct

Module to write a gct file from a pandas DataFrame.

Transform profiles into a gct (Gene Cluster Text) file. A gct is a tab-delimited text file that traditionally stores gene expression data. File Format Description: https://clue.io/connectopedia/gct_format.

Modified from cytominer_scripts "write_gct" written in R https://github.com/broadinstitute/cytominer_scripts/blob/master/write_gct.R

write_gct(profiles, output_file, features='infer', meta_features='infer', feature_metadata=None, version='#1.3')

Convert profiles to a .gct file.

Parameters:

Name Type Description Default
profiles DataFrame

DataFrame of profiles.

required
output_file str

If provided, will write gct to file.

required
features list

A list of strings corresponding to feature measurement column names in the profiles DataFrame. All features listed must be found in profiles. Defaults to "infer". If "infer", then assume cell painting features are those prefixed with "Cells", "Nuclei", or "Cytoplasm".

'infer'
meta_features list

A list of strings corresponding to metadata column names in the profiles DataFrame. All features listed must be found in profiles. Defaults to "infer". If "infer", then assume metadata features are those prefixed with "Metadata"

'infer'
feature_metadata DataFrame
None
version str

Important for gct loading into Morpheus

"#1.3"

Returns:

Type Description
None

Writes gct to file

Source code in pycytominer/cyto_utils/write_gct.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def write_gct(
    profiles,
    output_file,
    features="infer",
    meta_features="infer",
    feature_metadata=None,
    version="#1.3",
):
    """Convert profiles to a .gct file.

    Parameters
    ----------
    profiles : pandas.core.frame.DataFrame
        DataFrame of profiles.
    output_file : str
        If provided, will write gct to file.
    features : list
        A list of strings corresponding to feature measurement column names in the
        `profiles` DataFrame. All features listed must be found in `profiles`.
        Defaults to "infer". If "infer", then assume cell painting features are those
        prefixed with "Cells", "Nuclei", or "Cytoplasm".
    meta_features : list
        A list of strings corresponding to metadata column names in the `profiles`
        DataFrame. All features listed must be found in `profiles`. Defaults to "infer".
        If "infer", then assume metadata features are those prefixed with "Metadata"
    feature_metadata : pandas.core.frame.DataFrame, default None
        Optional per-feature annotations merged into the output; must contain a
        row indexed "id" that stores the feature metadata names.
    version : str, default "#1.3"
        Important for gct loading into Morpheus

    Returns
    -------
    None
        Writes gct to file
    """
    # Note, only version 1.3 is currently supported
    assert version == "#1.3", "Only version #1.3 is currently supported."  # noqa: S101

    # Step 1: Create first two rows of data
    if features == "infer":
        features = infer_cp_features(profiles)
    # Transpose so features become rows (gct data rows) and samples become columns
    feature_df = profiles.loc[:, features].reset_index(drop=True).transpose()

    # Separate out metadata features
    if meta_features == "infer":
        meta_features = infer_cp_features(profiles, metadata=True)
    metadata_df = profiles.loc[:, meta_features]

    # Step 2: Get the sample metadata portion of the output file
    # Double transpose: first to name sample columns "SAMPLE_<n>", then back so
    # each metadata field becomes a header row with an "id" label column.
    metadata_part = metadata_df.transpose()
    metadata_part.columns = [f"SAMPLE_{x}" for x in metadata_part.columns]
    metadata_part = (
        metadata_part.transpose()
        .reset_index()
        .rename({"index": "id"}, axis="columns")
        .transpose()
    )
    # Strip the "Metadata_" prefix from the metadata row labels
    metadata_part.index = [x.replace("Metadata_", "") for x in metadata_part.index]

    nrow_feature, ncol_features = feature_df.shape
    _, ncol_metadata = metadata_df.shape

    # Step 3: Compile feature metadata
    # Stack the sample-metadata header rows on top of the feature rows
    full_df = pd.concat([metadata_part, feature_df], axis="rows")
    if isinstance(feature_metadata, pd.DataFrame):
        nrow_metadata = feature_metadata.shape[1]
        assert (  # noqa: S101
            "id" in feature_metadata.index.tolist()
        ), "make sure feature metadata has row named 'id' that stores feature metadata names!"
        # Right-merge on the index keeps every row of full_df while attaching
        # the provided feature annotations as leading columns
        full_df = feature_metadata.merge(
            full_df, how="right", left_index=True, right_index=True
        )
    else:
        # No feature metadata supplied: insert a single placeholder column with
        # NaN for the sample-metadata rows and the feature name for feature rows
        feature_metadata = (
            ["cp_feature_name"] + [np.nan] * ncol_metadata + feature_df.index.tolist()
        )
        nrow_metadata = 1
        full_df.insert(0, column="feature_metadata", value=feature_metadata)
    full_df = full_df.reset_index()

    # Step 4: Compile all data dimensions
    data_dimensions = [nrow_feature, ncol_features, nrow_metadata, ncol_metadata]

    # Step 5: Write output gct file
    # gct layout: version line, dimensions line, then the assembled table rows
    with open(output_file, "w", newline="") as gctfile:
        gctwriter = csv.writer(gctfile, delimiter="\t")
        gctwriter.writerow([version])
        gctwriter.writerow(data_dimensions)
        for feature, row in full_df.iterrows():
            gctwriter.writerow(row)