basic integration of category encoders

rhiever · Mar 2, 2016 · 44c04c0 · 44c04c0
1 parent 63f51a2
commit 44c04c0
Show file tree

Hide file tree

Showing 3 changed files with 47 additions and 11 deletions.
diff --git a/README.md b/README.md
@@ -44,7 +44,7 @@ datacleaner can be used on the command line. Use `--help` to see its usage instr
 
 ```
 usage: datacleaner [-h] [-cv CROSS_VAL_FILENAME] [-o OUTPUT_FILENAME]
-                   [-cvo CV_OUTPUT_FILENAME] [-is INPUT_SEPARATOR]
+                   [-cvo CV_OUTPUT_FILENAME] [-is INPUT_SEPARATOR] [-en ENCODER]
                    [-os OUTPUT_SEPARATOR] [--drop-nans] [--version]
                    INPUT_FILENAME
 
@@ -63,6 +63,7 @@ optional arguments:
                         Data file to output the cleaned cross-validation data
                         set to
   -is INPUT_SEPARATOR   Column separator for the input file(s) (default: \t)
+  -en ENCODER           Name of encoder to use (from category_encoders) (default: None)
   -os OUTPUT_SEPARATOR  Column separator for the output file(s) (default: \t)
   --drop-nans           Drop all rows that have a NaN in any column (default:
                         False)
@@ -95,6 +96,9 @@ autoclean(input_dataframe, drop_nans=False, copy=False)
     
     copy: bool
         Make a copy of the data set (default: False)
+        
+    encoder: str
+        The name of an encoder from category_encoders to use in place of the default LabelEncoder (default: None)
     
     Returns
     ----------
@@ -123,6 +127,9 @@ autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=False)
     
     copy: bool
         Make a copy of the data set (default: False)
+        
+    encoder: str
+        The name of an encoder from category_encoders to use in place of the default LabelEncoder (default: None)
     
     Returns
     ----------

diff --git a/datacleaner/datacleaner.py b/datacleaner/datacleaner.py
@@ -26,12 +26,14 @@
 
 import argparse
 from update_checker import update_check
+import category_encoders
 
 from ._version import __version__
 
 update_checked = False
 
-def autoclean(input_dataframe, drop_nans=False, copy=False):
+
+def autoclean(input_dataframe, drop_nans=False, copy=False, encoder=None):
     """Performs a series of automated data cleaning transformations on the provided data set
 
     Parameters
@@ -45,6 +47,9 @@ def autoclean(input_dataframe, drop_nans=False, copy=False):
     copy: bool
         Make a copy of the data set (default: False)
 
+    encoder: str
+        The name of an encoder from category_encoders to use (default: None)
+
     Returns
     ----------
     output_dataframe: pandas.DataFrame
@@ -62,6 +67,7 @@ def autoclean(input_dataframe, drop_nans=False, copy=False):
     if drop_nans:
         input_dataframe.dropna(inplace=True)
 
+    obj_cols = []
     for column in input_dataframe.columns.values:
         # Replace NaNs with the median or mode of the column depending on the column type
         # If there are very many levels in the column, then it is probably continuous
@@ -71,12 +77,19 @@ def autoclean(input_dataframe, drop_nans=False, copy=False):
             input_dataframe[column].fillna(input_dataframe[column].mode()[0], inplace=True)
 
         # Encode all strings with numerical equivalents
-        if str(input_dataframe[column].values.dtype) == 'object':
-            input_dataframe[column] = LabelEncoder().fit_transform(input_dataframe[column].values)
+            if str(input_dataframe[column].values.dtype) == 'object':
+                if encoder is None:
+                    input_dataframe[column] = LabelEncoder().fit_transform(input_dataframe[column].values)
+                else:
+                    obj_cols.append(column)
+
+    if encoder is not None:
+        input_dataframe = category_encoders.__dict__[encoder].fit_transform(input_dataframe)
 
     return input_dataframe
 
-def autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=False):
+
+def autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=False, encoder=None):
     """Performs a series of automated data cleaning transformations on the provided training and testing data sets
 
     Unlike `autoclean()`, this function takes cross-validation into account by learning the data transformations
@@ -97,6 +110,9 @@ def autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=Fa
     copy: bool
         Make a copy of the data set (default: False)
 
+    encoder: str
+        The name of an encoder from category_encoders to use (default: None)
+
     Returns
     ----------
     output_training_dataframe: pandas.DataFrame
@@ -123,6 +139,7 @@ def autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=Fa
         training_dataframe.dropna(inplace=True)
         testing_dataframe.dropna(inplace=True)
 
+    obj_col = []
     for column in training_dataframe.columns.values:
         # Replace NaNs with the median or mode of the column depending on the column type
         # If there are very many levels in the column, then it is probably continuous
@@ -137,12 +154,21 @@ def autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=Fa
 
         # Encode all strings with numerical equivalents
         if str(training_dataframe[column].values.dtype) == 'object':
-            column_label_encoder = LabelEncoder().fit(training_dataframe[column].values)
-            training_dataframe[column] = column_label_encoder.transform(training_dataframe[column].values)
-            testing_dataframe[column] = column_label_encoder.transform(testing_dataframe[column].values)
+            if encoder is None:
+                column_label_encoder = LabelEncoder().fit(training_dataframe[column].values)
+                training_dataframe[column] = column_label_encoder.transform(training_dataframe[column].values)
+                testing_dataframe[column] = column_label_encoder.transform(testing_dataframe[column].values)
+            else:
+                obj_col.append(column)
+
+    if encoder is not None:
+        enc = category_encoders.__dict__[encoder].fit(training_dataframe)
+        training_dataframe = enc.transform(training_dataframe)
+        testing_dataframe = enc.transform(testing_dataframe)
 
     return training_dataframe, testing_dataframe
 
+
 def main():
     """Main function that is called when datacleaner is run on the command line"""
     parser = argparse.ArgumentParser(description='A Python tool that automatically cleans data sets and readies them for analysis')
@@ -152,6 +178,9 @@ def main():
     parser.add_argument('-cv', action='store', dest='CROSS_VAL_FILENAME', default=None,
                          type=str, help='File name for the validation data set if performing cross-validation')
 
+    parser.add_argument('-en', action='store', dest='ENCODER', default=None,
+                         type=str, help='Name of encoder from category_encoders library to use for obj fields')
+
     parser.add_argument('-o', action='store', dest='OUTPUT_FILENAME', default=None,
                         type=str, help='Data file to output the cleaned data set to')
 
@@ -173,7 +202,7 @@ def main():
 
     input_data = pd.read_csv(args.INPUT_FILENAME, sep=args.INPUT_SEPARATOR)
     if args.CROSS_VAL_FILENAME is None:
-        clean_data = autoclean(input_data, drop_nans=args.DROP_NANS)
+        clean_data = autoclean(input_data, drop_nans=args.DROP_NANS, encoder=args.ENCODER)
         if args.OUTPUT_FILENAME is None:
             print('Cleaned data set:')
             print(clean_data)
@@ -188,7 +217,7 @@ def main():
             return
 
         cross_val_data = pd.read_csv(args.CROSS_VAL_FILENAME, sep=args.INPUT_SEPARATOR)
-        clean_training_data, clean_testing_data = autoclean_cv(input_data, cross_val_data, drop_nans=args.DROP_NANS)
+        clean_training_data, clean_testing_data = autoclean_cv(input_data, cross_val_data, drop_nans=args.DROP_NANS, encoder=args.ENCODER)
 
         if args.OUTPUT_FILENAME is None:
             print('Cleaned training data set:')

diff --git a/setup.py b/setup.py
@@ -33,7 +33,7 @@ def calculate_version():
 This project is hosted at https://github.com/rhiever/datacleaner
 ''',
     zip_safe=True,
-    install_requires=['pandas', 'scikit-learn', 'update_checker'],
+    install_requires=['pandas', 'scikit-learn', 'update_checker', 'category_encoders'],
     classifiers=[
         'Intended Audience :: Developers',
         'Intended Audience :: Information Technology',