From 44c04c0fab2692586e398ff11b53f52afbfb7267 Mon Sep 17 00:00:00 2001 From: Will McGinnis Date: Wed, 2 Mar 2016 16:45:32 -0500 Subject: [PATCH 1/4] basic integration of category encoders --- README.md | 9 +++++++- datacleaner/datacleaner.py | 47 ++++++++++++++++++++++++++++++-------- setup.py | 2 +- 3 files changed, 47 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 55d41a5..808b455 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ datacleaner can be used on the command line. Use `--help` to see its usage instr ``` usage: datacleaner [-h] [-cv CROSS_VAL_FILENAME] [-o OUTPUT_FILENAME] - [-cvo CV_OUTPUT_FILENAME] [-is INPUT_SEPARATOR] + [-cvo CV_OUTPUT_FILENAME] [-is INPUT_SEPARATOR] [-en ENCODER] [-os OUTPUT_SEPARATOR] [--drop-nans] [--version] INPUT_FILENAME @@ -63,6 +63,7 @@ optional arguments: Data file to output the cleaned cross-validation data set to -is INPUT_SEPARATOR Column separator for the input file(s) (default: \t) + -en ENCODER Name of encoder to use (from category_encoders) (default: None) -os OUTPUT_SEPARATOR Column separator for the output file(s) (default: \t) --drop-nans Drop all rows that have a NaN in any column (default: False) @@ -95,6 +96,9 @@ autoclean(input_dataframe, drop_nans=False, copy=False) copy: bool Make a copy of the data set (default: False) + + encoder: str + The name of an encoder from category_encoders to use (default: None) Returns ---------- @@ -123,6 +127,9 @@ autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=False) copy: bool Make a copy of the data set (default: False) + + encoder: str + The name of an encoder from category_encoders to use (default: None) Returns ---------- diff --git a/datacleaner/datacleaner.py b/datacleaner/datacleaner.py index 791c49a..0211f36 100644 --- a/datacleaner/datacleaner.py +++ b/datacleaner/datacleaner.py @@ -26,12 +26,14 @@ import argparse from update_checker import update_check +import category_encoders from ._version import __version__ update_checked = False -def autoclean(input_dataframe, drop_nans=False, copy=False): + +def autoclean(input_dataframe, drop_nans=False, copy=False, encoder=None): """Performs a series of automated data cleaning transformations on the provided data set Parameters @@ -45,6 +47,9 @@ def autoclean(input_dataframe, drop_nans=False, copy=False): copy: bool Make a copy of the data set (default: False) + encoder: str + The name of an encoder from category_encoders to use (default: None) + Returns ---------- output_dataframe: pandas.DataFrame @@ -62,6 +67,7 @@ def autoclean(input_dataframe, drop_nans=False, copy=False): if drop_nans: input_dataframe.dropna(inplace=True) + obj_cols = [] for column in input_dataframe.columns.values: # Replace NaNs with the median or mode of the column depending on the column type # If there are very many levels in the column, then it is probably continuous @@ -71,12 +77,19 @@ def autoclean(input_dataframe, drop_nans=False, copy=False): input_dataframe[column].fillna(input_dataframe[column].mode()[0], inplace=True) # Encode all strings with numerical equivalents - if str(input_dataframe[column].values.dtype) == 'object': - input_dataframe[column] = LabelEncoder().fit_transform(input_dataframe[column].values) + if str(input_dataframe[column].values.dtype) == 'object': + if encoder is None: + input_dataframe[column] = LabelEncoder().fit_transform(input_dataframe[column].values) + else: + obj_cols.append(column) + + if encoder is not None: + input_dataframe = category_encoders.__dict__[encoder].fit_transform(input_dataframe) return input_dataframe -def autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=False): + +def autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=False, encoder=None): """Performs a series of automated data cleaning transformations on the provided training and testing data sets Unlike `autoclean()`, this function takes cross-validation into account by learning the data transformations @@ -97,6 +110,9 @@ def autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=Fa copy: bool Make a copy of the data set (default: False) + encoder: str + The name of an encoder from category_encoders to use (default: None) + Returns ---------- output_training_dataframe: pandas.DataFrame @@ -123,6 +139,7 @@ def autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=Fa training_dataframe.dropna(inplace=True) testing_dataframe.dropna(inplace=True) + obj_col = [] for column in training_dataframe.columns.values: # Replace NaNs with the median or mode of the column depending on the column type # If there are very many levels in the column, then it is probably continuous @@ -137,12 +154,21 @@ def autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=Fa # Encode all strings with numerical equivalents if str(training_dataframe[column].values.dtype) == 'object': - column_label_encoder = LabelEncoder().fit(training_dataframe[column].values) - training_dataframe[column] = column_label_encoder.transform(training_dataframe[column].values) - testing_dataframe[column] = column_label_encoder.transform(testing_dataframe[column].values) + if encoder is None: + column_label_encoder = LabelEncoder().fit(training_dataframe[column].values) + training_dataframe[column] = column_label_encoder.transform(training_dataframe[column].values) + testing_dataframe[column] = column_label_encoder.transform(testing_dataframe[column].values) + else: + obj_col.append(column) + + if encoder is not None: + enc = category_encoders.__dict__[encoder].fit(training_dataframe) + training_dataframe = enc.transform(training_dataframe) + testing_dataframe = enc.transform(testing_dataframe) return training_dataframe, testing_dataframe + def main(): """Main function that is called when datacleaner is run on the command line""" parser = argparse.ArgumentParser(description='A Python tool that automatically cleans data sets and readies them for analysis') @@ -152,6 +178,9 @@ def main(): parser.add_argument('-cv', action='store', dest='CROSS_VAL_FILENAME', default=None, type=str, help='File name for the validation data set if performing cross-validation') + parser.add_argument('-en', action='store', dest='ENCODER', default=None, + type=str, help='Name of encoder from category_encoders library to use for obj fields') + parser.add_argument('-o', action='store', dest='OUTPUT_FILENAME', default=None, type=str, help='Data file to output the cleaned data set to') @@ -173,7 +202,7 @@ def main(): input_data = pd.read_csv(args.INPUT_FILENAME, sep=args.INPUT_SEPARATOR) if args.CROSS_VAL_FILENAME is None: - clean_data = autoclean(input_data, drop_nans=args.DROP_NANS) + clean_data = autoclean(input_data, drop_nans=args.DROP_NANS, encoder=args.ENCODER) if args.OUTPUT_FILENAME is None: print('Cleaned data set:') print(clean_data) @@ -188,7 +217,7 @@ def main(): return cross_val_data = pd.read_csv(args.CROSS_VAL_FILENAME, sep=args.INPUT_SEPARATOR) - clean_training_data, clean_testing_data = autoclean_cv(input_data, cross_val_data, drop_nans=args.DROP_NANS) + clean_training_data, clean_testing_data = autoclean_cv(input_data, cross_val_data, drop_nans=args.DROP_NANS, encoder=args.ENCODER) if args.OUTPUT_FILENAME is None: print('Cleaned training data set:') diff --git a/setup.py b/setup.py index bac22ef..eeaf4f0 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ def calculate_version(): This project is hosted at https://github.com/rhiever/datacleaner ''', zip_safe=True, - install_requires=['pandas', 'scikit-learn', 'update_checker'], + install_requires=['pandas', 'scikit-learn', 'update_checker', 'category_encoders'], classifiers=[ 'Intended Audience :: Developers', 'Intended Audience :: Information Technology', From d71f01b09ddddf8265b62ecd22983cf9b92e0b4d Mon Sep 17 00:00:00 2001 From: Will McGinnis Date: Thu, 3 Mar 2016 08:44:29 -0500 Subject: [PATCH 2/4] refactored to only use custom encoders in script mode --- datacleaner/datacleaner.py | 50 ++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/datacleaner/datacleaner.py b/datacleaner/datacleaner.py index 0211f36..0a4bbbf 100644 --- a/datacleaner/datacleaner.py +++ b/datacleaner/datacleaner.py @@ -26,14 +26,13 @@ import argparse from update_checker import update_check -import category_encoders from ._version import __version__ update_checked = False -def autoclean(input_dataframe, drop_nans=False, copy=False, encoder=None): +def autoclean(input_dataframe, drop_nans=False, copy=False, encoder=None, encoder_kwargs=None): """Performs a series of automated data cleaning transformations on the provided data set Parameters @@ -47,8 +46,11 @@ def autoclean(input_dataframe, drop_nans=False, copy=False, encoder=None): copy: bool Make a copy of the data set (default: False) - encoder: str - The name of an encoder from category_encoders to use (default: None) + encoder: category_encoders transformer + The a valid category_encoders transformer which is passed an inferred cols list. Default (None: LabelEncoder) + + encoder_kwargs: category_encoders + The a valid sklearn transformer to encode categorical features. Default (None) Returns ---------- @@ -61,6 +63,9 @@ def autoclean(input_dataframe, drop_nans=False, copy=False, encoder=None): update_check('datacleaner', __version__) update_checked = True + if encoder_kwargs is None: + encoder_kwargs = {} + if copy: input_dataframe = input_dataframe.copy() @@ -77,19 +82,19 @@ def autoclean(input_dataframe, drop_nans=False, copy=False, encoder=None): input_dataframe[column].fillna(input_dataframe[column].mode()[0], inplace=True) # Encode all strings with numerical equivalents - if str(input_dataframe[column].values.dtype) == 'object': - if encoder is None: - input_dataframe[column] = LabelEncoder().fit_transform(input_dataframe[column].values) - else: - obj_cols.append(column) + if str(input_dataframe[column].values.dtype) == 'object': + if encoder is None: + input_dataframe[column] = encoder().fit_transform(input_dataframe[column].values) + else: + obj_cols.append(column) if encoder is not None: - input_dataframe = category_encoders.__dict__[encoder].fit_transform(input_dataframe) + input_dataframe = encoder(cols=obj_cols, **encoder_kwargs).fit_transform(input_dataframe) return input_dataframe -def autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=False, encoder=None): +def autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=False, encoder=None, encoder_kwargs=None): """Performs a series of automated data cleaning transformations on the provided training and testing data sets Unlike `autoclean()`, this function takes cross-validation into account by learning the data transformations @@ -110,8 +115,11 @@ def autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=Fa copy: bool Make a copy of the data set (default: False) - encoder: str - The name of an encoder from category_encoders to use (default: None) + encoder: category_encoders transformer + The a valid category_encoders transformer which is passed an inferred cols list. Default (None: LabelEncoder) + + encoder_kwargs: category_encoders + The a valid sklearn transformer to encode categorical features. Default (None) Returns ---------- @@ -131,6 +139,9 @@ def autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=Fa raise ValueError('The training and testing DataFrames do not have the same columns. ' 'Make sure that you are providing the same columns.') + if encoder_kwargs is None: + encoder_kwargs = {} + if copy: training_dataframe = training_dataframe.copy() testing_dataframe = testing_dataframe.copy() @@ -139,7 +150,7 @@ def autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=Fa training_dataframe.dropna(inplace=True) testing_dataframe.dropna(inplace=True) - obj_col = [] + obj_cols = [] for column in training_dataframe.columns.values: # Replace NaNs with the median or mode of the column depending on the column type # If there are very many levels in the column, then it is probably continuous @@ -159,10 +170,10 @@ def autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=Fa training_dataframe[column] = column_label_encoder.transform(training_dataframe[column].values) testing_dataframe[column] = column_label_encoder.transform(testing_dataframe[column].values) else: - obj_col.append(column) + obj_cols.append(column) if encoder is not None: - enc = category_encoders.__dict__[encoder].fit(training_dataframe) + enc = encoder(cols=obj_cols, **encoder_kwargs).fit(training_dataframe) training_dataframe = enc.transform(training_dataframe) testing_dataframe = enc.transform(testing_dataframe) @@ -178,9 +189,6 @@ def main(): parser.add_argument('-cv', action='store', dest='CROSS_VAL_FILENAME', default=None, type=str, help='File name for the validation data set if performing cross-validation') - parser.add_argument('-en', action='store', dest='ENCODER', default=None, - type=str, help='Name of encoder from category_encoders library to use for obj fields') - parser.add_argument('-o', action='store', dest='OUTPUT_FILENAME', default=None, type=str, help='Data file to output the cleaned data set to') @@ -202,7 +210,7 @@ def main(): input_data = pd.read_csv(args.INPUT_FILENAME, sep=args.INPUT_SEPARATOR) if args.CROSS_VAL_FILENAME is None: - clean_data = autoclean(input_data, drop_nans=args.DROP_NANS, encoder=args.ENCODER) + clean_data = autoclean(input_data, drop_nans=args.DROP_NANS) if args.OUTPUT_FILENAME is None: print('Cleaned data set:') print(clean_data) @@ -217,7 +225,7 @@ def main(): return cross_val_data = pd.read_csv(args.CROSS_VAL_FILENAME, sep=args.INPUT_SEPARATOR) - clean_training_data, clean_testing_data = autoclean_cv(input_data, cross_val_data, drop_nans=args.DROP_NANS, encoder=args.ENCODER) + clean_training_data, clean_testing_data = autoclean_cv(input_data, cross_val_data, drop_nans=args.DROP_NANS) if args.OUTPUT_FILENAME is None: print('Cleaned training data set:') From ae4c87bc43172892785c94246b7bcc09dcde2fa1 Mon Sep 17 00:00:00 2001 From: Will McGinnis Date: Thu, 3 Mar 2016 08:44:40 -0500 Subject: [PATCH 3/4] updated read me and setup --- README.md | 17 +++++++++++------ setup.py | 2 +- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 808b455..ef6eed6 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ datacleaner can be used on the command line. Use `--help` to see its usage instr ``` usage: datacleaner [-h] [-cv CROSS_VAL_FILENAME] [-o OUTPUT_FILENAME] - [-cvo CV_OUTPUT_FILENAME] [-is INPUT_SEPARATOR] [-en ENCODER] + [-cvo CV_OUTPUT_FILENAME] [-is INPUT_SEPARATOR] [-os OUTPUT_SEPARATOR] [--drop-nans] [--version] INPUT_FILENAME @@ -63,7 +63,6 @@ optional arguments: Data file to output the cleaned cross-validation data set to -is INPUT_SEPARATOR Column separator for the input file(s) (default: \t) - -en ENCODER Name of encoder to use (from category_encoders) (default: None) -os OUTPUT_SEPARATOR Column separator for the output file(s) (default: \t) --drop-nans Drop all rows that have a NaN in any column (default: False) @@ -97,8 +96,11 @@ autoclean(input_dataframe, drop_nans=False, copy=False) copy: bool Make a copy of the data set (default: False) - encoder: str - The name of an encoder from category_encoders to use (default: None) + encoder: category_encoders transformer + The a valid category_encoders transformer which is passed an inferred cols list. Default (None: LabelEncoder) + + encoder_kwargs: category_encoders + The a valid sklearn transformer to encode categorical features. Default (None) Returns ---------- @@ -128,8 +130,11 @@ autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=False) copy: bool Make a copy of the data set (default: False) - encoder: str - The name of an encoder from category_encoders to use (default: None) + encoder: category_encoders transformer + The a valid category_encoders transformer which is passed an inferred cols list. Default (None: LabelEncoder) + + encoder_kwargs: category_encoders + The a valid sklearn transformer to encode categorical features. Default (None) Returns ---------- diff --git a/setup.py b/setup.py index eeaf4f0..bac22ef 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ def calculate_version(): This project is hosted at https://github.com/rhiever/datacleaner ''', zip_safe=True, - install_requires=['pandas', 'scikit-learn', 'update_checker', 'category_encoders'], + install_requires=['pandas', 'scikit-learn', 'update_checker'], classifiers=[ 'Intended Audience :: Developers', 'Intended Audience :: Information Technology', From c7cd7383ac53d4061fa4e70ba64d8dec07c41779 Mon Sep 17 00:00:00 2001 From: Will McGinnis Date: Thu, 3 Mar 2016 13:18:06 -0500 Subject: [PATCH 4/4] updated to operate column wise --- datacleaner/datacleaner.py | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/datacleaner/datacleaner.py b/datacleaner/datacleaner.py index 0a4bbbf..07913fd 100644 --- a/datacleaner/datacleaner.py +++ b/datacleaner/datacleaner.py @@ -66,13 +66,15 @@ def autoclean(input_dataframe, drop_nans=False, copy=False, encoder=None, encode if encoder_kwargs is None: encoder_kwargs = {} + if encoder is None: + encoder = LabelEncoder + if copy: input_dataframe = input_dataframe.copy() if drop_nans: input_dataframe.dropna(inplace=True) - obj_cols = [] for column in input_dataframe.columns.values: # Replace NaNs with the median or mode of the column depending on the column type # If there are very many levels in the column, then it is probably continuous @@ -83,13 +85,7 @@ def autoclean(input_dataframe, drop_nans=False, copy=False, encoder=None, encode # Encode all strings with numerical equivalents if str(input_dataframe[column].values.dtype) == 'object': - if encoder is None: - input_dataframe[column] = encoder().fit_transform(input_dataframe[column].values) - else: - obj_cols.append(column) - - if encoder is not None: - input_dataframe = encoder(cols=obj_cols, **encoder_kwargs).fit_transform(input_dataframe) + input_dataframe[column] = encoder(**encoder_kwargs).fit_transform(input_dataframe[column].values) return input_dataframe @@ -142,6 +138,9 @@ def autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=Fa if encoder_kwargs is None: encoder_kwargs = {} + if encoder is None: + encoder = LabelEncoder + if copy: training_dataframe = training_dataframe.copy() testing_dataframe = testing_dataframe.copy() @@ -150,7 +149,6 @@ def autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=Fa training_dataframe.dropna(inplace=True) testing_dataframe.dropna(inplace=True) - obj_cols = [] for column in training_dataframe.columns.values: # Replace NaNs with the median or mode of the column depending on the column type # If there are very many levels in the column, then it is probably continuous @@ -166,16 +164,9 @@ def autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=Fa # Encode all strings with numerical equivalents if str(training_dataframe[column].values.dtype) == 'object': if encoder is None: - column_label_encoder = LabelEncoder().fit(training_dataframe[column].values) + column_label_encoder = encoder(**encoder_kwargs).fit(training_dataframe[column].values) training_dataframe[column] = column_label_encoder.transform(training_dataframe[column].values) testing_dataframe[column] = column_label_encoder.transform(testing_dataframe[column].values) - else: - obj_cols.append(column) - - if encoder is not None: - enc = encoder(cols=obj_cols, **encoder_kwargs).fit(training_dataframe) - training_dataframe = enc.transform(training_dataframe) - testing_dataframe = enc.transform(testing_dataframe) return training_dataframe, testing_dataframe