-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathPredictiveAnalyticsML_Lab04-Classification-DT-Churn-Classification.json
1 lines (1 loc) · 14.1 KB
/
PredictiveAnalyticsML_Lab04-Classification-DT-Churn-Classification.json
1
{"paragraphs":[{"title":"Overview of decision tree classification experiment","text":"%md\n%%% yuml type=activity scale=80 dir=LR \n(data)[random 70%]->(trainingData),(data)[random 30%]->(testData),(trainingData)[fit]->|b|,|b|->(combineFeature),(combineFeature)->(labelIndexer),(labelIndexer)->(DT),(DT)->|c|,|c|->(<combineFeature>),(<combineFeature>)->(<labelIndexer>),(<labelIndexer>)->(<DT>),(<DT>)->|d|,(testData)->|c|,|d|[transform]->(prediction)\n%%%","user":"anonymous","dateUpdated":"2020-10-31T03:15:59+0000","config":{"tableHide":false,"editorSetting":{"language":"markdown","editOnDblClick":true},"colWidth":12,"editorMode":"ace/mode/markdown","editorHide":true,"fontSize":9,"title":true,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"HTML","data":"<div class=\"markdown-body\">\n<img src=\"http://yuml.me/diagram/scruffy;dir:LR;scale:80/activity/%28data%29%5Brandom+70%25%5D-%3E%28trainingData%29%2C%28data%29%5Brandom+30%25%5D-%3E%28testData%29%2C%28trainingData%29%5Bfit%5D-%3E%7Cb%7C%2C%7Cb%7C-%3E%28combineFeature%29%2C%28combineFeature%29-%3E%28labelIndexer%29%2C%28labelIndexer%29-%3E%28DT%29%2C%28DT%29-%3E%7Cc%7C%2C%7Cc%7C-%3E%28%3CcombineFeature%3E%29%2C%28%3CcombineFeature%3E%29-%3E%28%3ClabelIndexer%3E%29%2C%28%3ClabelIndexer%3E%29-%3E%28%3CDT%3E%29%2C%28%3CDT%3E%29-%3E%7Cd%7C%2C%28testData%29-%3E%7Cc%7C%2C%7Cd%7C%5Btransform%5D-%3E%28prediction%29.svg\" title=\"title\" />\n</div>"}]},"apps":[],"jobName":"paragraph_1604114159181_-97917439","id":"20180420-163423_119657940","dateCreated":"2020-10-31T03:15:59+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"focus":true,"$$hashKey":"object:4203"},{"title":"Using","text":"%md\n%%% yuml type=activity scale=80 dir=LR \n<a>[load]->(data),(data)->|b|,<c>[load 
pipeline]->(<combineFeature>),|b|->(<combineFeature>),(<combineFeature>)->(<labelIndexer>),(<labelIndexer>)->(<DT>),(<DT>)->|c|,|c|[transform]->(prediction)\n%%%","user":"anonymous","dateUpdated":"2020-10-31T03:15:59+0000","config":{"tableHide":false,"editorSetting":{"language":"markdown","editOnDblClick":true},"colWidth":12,"editorMode":"ace/mode/markdown","editorHide":true,"fontSize":9,"title":true,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"HTML","data":"<div class=\"markdown-body\">\n<img src=\"http://yuml.me/diagram/scruffy;dir:LR;scale:80/activity/%3Ca%3E%5Bload%5D-%3E%28data%29%2C%28data%29-%3E%7Cb%7C%2C%3Cc%3E%5Bload+pipeline%5D-%3E%28%3CcombineFeature%3E%29%2C%7Cb%7C-%3E%28%3CcombineFeature%3E%29%2C%28%3CcombineFeature%3E%29-%3E%28%3ClabelIndexer%3E%29%2C%28%3ClabelIndexer%3E%29-%3E%28%3CDT%3E%29%2C%28%3CDT%3E%29-%3E%7Cc%7C%2C%7Cc%7C%5Btransform%5D-%3E%28prediction%29.svg\" title=\"title\" />\n</div>"}]},"apps":[],"jobName":"paragraph_1604114159182_-352210922","id":"20180420-163430_1328743461","dateCreated":"2020-10-31T03:15:59+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:4204"},{"title":"Download ข้อมูล","text":"%sh\nif ! hdfs dfs -test -e \"/user/zeppelin/dataset\"; then\n hdfs dfs -mkdir -p /user/zeppelin/dataset\nfi\nif ! 
hdfs dfs -test -e \"/user/zeppelin/dataset/churn_data.csv\"; then\n wget https://github.com/praisan/hello-world/raw/master/churn_data.csv\n hdfs dfs -put churn_data.csv /user/zeppelin/dataset\nfi","user":"anonymous","dateUpdated":"2020-10-31T03:17:05+0000","config":{"editorSetting":{"language":"sh","editOnDblClick":false,"completionSupport":false},"colWidth":12,"editorMode":"ace/mode/sh","editorHide":false,"fontSize":9,"title":true,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1604114159183_702518702","id":"20180420-083255_1448992339","dateCreated":"2020-10-31T03:15:59+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:4205"},{"title":"DataFrame","text":"%pyspark\n# 1. Load data\ndata=spark.read.csv(\"/user/zeppelin/dataset/churn_data.csv\",inferSchema=True,header=True)\ndata = data.withColumnRenamed(\"Churn\",\"label\")\nprint(\"1.1 Load data\")\ndata.printSchema()\ndata.show(5)","user":"anonymous","dateUpdated":"2020-10-31T03:15:59+0000","config":{"editorSetting":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"colWidth":12,"editorMode":"ace/mode/python","fontSize":9,"title":true,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1604114159183_-821878947","id":"20180420-104428_2111886087","dateCreated":"2020-10-31T03:15:59+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:4206"},{"text":"%pyspark\n(trainingSet, testSet) = data.randomSplit([0.7, 
0.3],seed=10)\nz.show(trainingSet)","user":"anonymous","dateUpdated":"2020-10-31T03:15:59+0000","config":{"editorSetting":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"colWidth":12,"editorMode":"ace/mode/python","editorHide":false,"fontSize":9,"results":{"0":{"graph":{"mode":"scatterChart","height":300,"optionOpen":false,"setting":{"lineChart":{},"scatterChart":{"group":{"name":"label","index":20,"aggr":"sum"},"xAxis":{"name":"DayMins","index":2,"aggr":"sum"},"yAxis":{"name":"EveMins","index":3,"aggr":"sum"},"size":{"name":"label","index":20,"aggr":"sum"}}},"keys":[{"name":"Sensor3","index":7,"aggr":"sum"}],"groups":[],"values":[{"name":"label","index":26,"aggr":"sum"}]},"helium":{}}},"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1604114159184_-370160928","id":"20180420-103403_455918027","dateCreated":"2020-10-31T03:15:59+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:4207"},{"title":"Transformers and Parameters","text":"%pyspark\nfrom pyspark.ml import Pipeline\nfrom pyspark.ml.feature import VectorAssembler, StandardScaler\nfrom pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit\nfrom pyspark.ml.classification import DecisionTreeClassifier\nfrom pyspark.ml.feature import StringIndexer, VectorIndexer\nfrom pyspark.ml.evaluation import MulticlassClassificationEvaluator\n\n# Transformers\ncombineFeature = VectorAssembler(\n inputCols=[\"DayMins\", \"EveMins\", \"NightMins\", \"IntlMins\", \"CustServCalls\"],\n outputCol=\"combinedFeatures\")\n\n# Transformers \n\n\nlabelIndexer = StringIndexer(inputCol=\"label\", outputCol=\"indexedLabel\")\n\n# Transformers\ndt = DecisionTreeClassifier(labelCol=\"indexedLabel\", featuresCol=\"combinedFeatures\", maxBins=32)\n\n# Model selection and hyperparameter tuning\nparamGrid = ParamGridBuilder()\\\n .addGrid(dt.impurity, ['gini','entropy']) \\\n .addGrid(dt.maxDepth, [5, 10, 20])\\\n 
.build()\n\ntvs = TrainValidationSplit(estimator=dt,\n estimatorParamMaps=paramGrid,\n evaluator=MulticlassClassificationEvaluator(metricName=\"f1\"),\n # 80% of the data will be used for training, 20% for validation.\n trainRatio=0.8)\n# Pipeline \npipeline = Pipeline(stages=[combineFeature,labelIndexer, tvs])\n","user":"anonymous","dateUpdated":"2020-10-31T03:15:59+0000","config":{"editorSetting":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"colWidth":12,"editorMode":"ace/mode/python","fontSize":9,"title":true,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1604114159184_1741436965","id":"20180420-083718_1926967630","dateCreated":"2020-10-31T03:15:59+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:4208"},{"title":"Model Training","text":"%pyspark\n# Estimator\nmodel = pipeline.fit(trainingSet)","user":"anonymous","dateUpdated":"2020-10-31T03:15:59+0000","config":{"editorSetting":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"colWidth":12,"editorMode":"ace/mode/python","editorHide":false,"fontSize":9,"title":true,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1604114159184_-1153750990","id":"20180420-083749_1825669107","dateCreated":"2020-10-31T03:15:59+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:4209"},{"text":"%pyspark\nprint(model.stages[2].bestModel.toDebugString)","user":"anonymous","dateUpdated":"2020-10-31T03:15:59+0000","config":{"editorSetting":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"colWidth":12,"editorMode":"ace/mode/python","fontSize":9,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1604114159184_1347848534","id":"20180420-111722_1498007173","dateCreated":"2020-10-31T03:15:59+0
000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:4210"},{"title":"Model Using","text":"%pyspark\n# Transformers\nprediction = model.transform(testSet)","user":"anonymous","dateUpdated":"2020-10-31T03:15:59+0000","config":{"editorSetting":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"colWidth":12,"editorMode":"ace/mode/python","fontSize":9,"title":true,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1604114159185_1547233407","id":"20180420-100433_1980506948","dateCreated":"2020-10-31T03:15:59+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:4211"},{"text":"%pyspark\nz.show(prediction)","user":"anonymous","dateUpdated":"2020-10-31T03:15:59+0000","config":{"editorSetting":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"colWidth":12,"editorMode":"ace/mode/python","fontSize":9,"results":{"0":{"graph":{"mode":"table","height":300,"optionOpen":false,"setting":{"table":{"tableGridState":{},"tableColumnTypeState":{"names":{"AccountLength":"string","VMailMessage":"string","DayMins":"string","EveMins":"string","NightMins":"string","IntlMins":"string","CustServCalls":"string","IntPlan":"string","VMailPlan":"string","DayCalls":"string","DayCharge":"string","EveCalls":"string","EveCharge":"string","NightCalls":"string","NightCharge":"string","IntlCalls":"string","IntlCharge":"string","State":"string","AreaCode":"string","Phone":"string","label":"string","combinedFeatures":"string","indexedLabel":"string","rawPrediction":"string","probability":"string","prediction":"string"},"updated":false},"tableOptionSpecHash":"[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for 
columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]","tableOptionValue":{"useFilter":false,"showPagination":false,"showAggregationFooter":false},"updated":false,"initialized":false}},"commonSetting":{}}}},"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1604114159185_1979493885","id":"20180420-160029_668256473","dateCreated":"2020-10-31T03:15:59+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:4212"},{"title":"Analyze results","text":"%pyspark\nevaluator = MulticlassClassificationEvaluator(predictionCol=\"prediction\", labelCol=\"label\")\nprint(\"Accuracy\",evaluator.evaluate(prediction, {evaluator.metricName: \"accuracy\"}))\nprint(\"F1\",evaluator.evaluate(prediction, {evaluator.metricName: \"f1\"}))","user":"anonymous","dateUpdated":"2020-10-31T03:15:59+0000","config":{"editorSetting":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"colWidth":12,"editorMode":"ace/mode/python","editorHide":false,"fontSize":9,"title":true,"results":{"1":{"graph":{"mode":"lineChart","height":300,"optionOpen":false,"setting":{"lineChart":{"lineWithFocus":false}},"commonSetting":{},"keys":[{"name":"Date","index":0,"aggr":"sum"}],"groups":[],"values":[{"name":"label","index":1,"aggr":"sum"},{"name":"prediction","index":11,"aggr":"sum"}]},"helium":{}}},"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1604114159185_793763331","id":"20180420-100757_1596878955","dateCreated":"2020-10-31T03:15:59+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:4213"},{"text":"%pyspark\nfrom 
pyspark.mllib.evaluation import MulticlassMetrics\nmetrics = MulticlassMetrics(prediction[[\"prediction\",\"label\"]].rdd.map(lambda x:[float(x[0]),float(x[1])]))\n\n# Confusion matrix\nprint(\"Confusion matrix:\")\nprint(metrics.confusionMatrix())","user":"anonymous","dateUpdated":"2020-10-31T03:15:59+0000","config":{"editorSetting":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"colWidth":12,"editorMode":"ace/mode/python","fontSize":9,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1604114159185_-1362177613","id":"20180420-160954_1605359117","dateCreated":"2020-10-31T03:15:59+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:4214"},{"text":"%pyspark\n","user":"anonymous","dateUpdated":"2020-10-31T03:15:59+0000","config":{"editorSetting":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"colWidth":12,"editorMode":"ace/mode/python","fontSize":9,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1604114159186_-1719881239","id":"20190318-152633_421553608","dateCreated":"2020-10-31T03:15:59+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:4215"}],"name":"PredictiveAnalyticsML/Lab04-Classification-DT-Churn-Classification","id":"2FN1W727X","noteParams":{},"noteForms":{},"angularObjects":{"md:shared_process":[],"sh:shared_process":[],"spark:shared_process":[]},"config":{"isZeppelinNotebookCronEnable":false,"looknfeel":"default","personalizedMode":"false"},"info":{}}