|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | + |
| 3 | +""" |
| 4 | +Copyright (c) 2016 Randal S. Olson |
| 5 | +
|
| 6 | +Permission is hereby granted, free of charge, to any person obtaining a copy of this software |
| 7 | +and associated documentation files (the "Software"), to deal in the Software without restriction, |
| 8 | +including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| 9 | +and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, |
| 10 | +subject to the following conditions: |
| 11 | +
|
| 12 | +The above copyright notice and this permission notice shall be included in all copies or substantial |
| 13 | +portions of the Software. |
| 14 | +
|
| 15 | +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT |
| 16 | +LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
| 17 | +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
| 18 | +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
| 19 | +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
| 20 | +""" |
| 21 | + |
| 22 | +from __future__ import print_function |
| 23 | +import pandas as pd |
| 24 | +import argparse |
| 25 | + |
def autoclean(input_dataframe):
    """Apply automated data cleaning transformations to a data set.

    NOTE: no cleaning steps are implemented yet. The argument is not
    inspected or modified, and the function always returns ``None``.

    Parameters
    ----------
    input_dataframe: pandas.DataFrame
        Data set to clean

    Returns
    -------
    None
        Placeholder result. The intended return value is the cleaned
        data set as a pandas.DataFrame once the cleaning logic exists.

    """
    # Stub: explicit None so the placeholder status is obvious to readers.
    return None
| 41 | + |
def autoclean_cv(training_dataframe, testing_dataframe):
    """Apply automated data cleaning to a training/testing pair of data sets.

    Intended as the cross-validation-aware counterpart of `autoclean()`:
    the data transformations are meant to be learned from the training set
    only and then applied to both the training and the testing set, so that
    no information from the testing set influences the learned
    transformations.

    NOTE: no cleaning steps are implemented yet. Neither argument is
    inspected or modified, and the function always returns ``None``.

    Parameters
    ----------
    training_dataframe: pandas.DataFrame
        Training data set

    testing_dataframe: pandas.DataFrame
        Testing data set

    Returns
    -------
    None
        Placeholder result. The intended return values are the cleaned
        training data set and the cleaned testing data set (both
        pandas.DataFrame) once the cleaning logic exists.

    """
    # Stub: explicit None so the placeholder status is obvious to readers.
    return None
| 67 | + |
def main():
    """Command-line entry point for datacleaner.

    Builds the argument parser and parses the command line. The parsed
    arguments are not consumed yet: no file is read, cleaned, or written.
    ``--version`` and ``--help`` are handled by argparse itself, which
    prints and exits.
    """
    from _version import __version__

    cli = argparse.ArgumentParser(
        description='A Python tool that automatically cleans data sets and readies them for analysis'
    )

    cli.add_argument('INPUT_FILENAME', type=str, help='Data file to clean')

    # String-valued options as (flag, dest, default, help text), registered
    # in this order so the generated help output keeps the same layout.
    string_options = (
        ('-o', 'OUTPUT_FILENAME', None,
         'Data file to output to'),
        ('-is', 'INPUT_SEPARATOR', '\t',
         'Column separator for the input file (default: \\t)'),
        ('-os', 'OUTPUT_SEPARATOR', '\t',
         'Column separator for the output file (default: \\t)'),
    )
    for flag, dest, default, help_text in string_options:
        cli.add_argument(flag, action='store', dest=dest, default=default,
                         type=str, help=help_text)

    version_banner = 'datacleaner v{version}'.format(version=__version__)
    cli.add_argument('--version', action='version', version=version_banner)

    # Parsing validates the command line (argparse exits on bad input);
    # the resulting namespace is not used further yet.
    args = cli.parse_args()
| 89 | + |
| 90 | + |
| 91 | + |
# Run the command-line interface only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
0 commit comments