A marketing agency has many clients who use its service to produce ads for the client websites. The agency has noticed quite a bit of client churn. Right now it assigns account managers essentially at random, and it wants us to build a machine learning model that predicts which customers will churn (stop buying the service), so that an account manager can be assigned to the customers most at risk.
The data is saved as customer_churn.csv. Here are the fields and their definitions:
Names: Name of the latest contact at the company
Age: Customer age
Total_Purchase: Total ads purchased
Account_Manager: Binary; 0 = no manager, 1 = account manager assigned
Years: Total years as a customer
Num_Sites: Number of websites that use the service
Onboard_date: Date the latest contact was onboarded
Location: Client HQ address
Company: Name of the client company
# Data and Setup.
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('logregconsult').getOrCreate()
data = spark.read.csv('customer_churn.csv',inferSchema=True,
header=True)
data.printSchema()
root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)
data.describe().show()
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|        Names|              Age|   Total_Purchase|   Account_Manager|            Years|         Num_Sites|            Location|             Company|              Churn|
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|  count|          900|              900|              900|               900|              900|               900|                 900|                 900|                900|
|   mean|         null|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777|                null|                null|0.16666666666666666|
| stddev|         null|6.127560416916251|2408.644531858096|0.4999208935073339|1.274449013194616|1.7648355920350969|                null|                null| 0.3728852122772358|
|    min|   Aaron King|             22.0|            100.0|                 0|              1.0|               3.0|00103 Jeffrey Cre...|     Abbott-Thompson|                  0|
|    max|Zachary Walsh|             65.0|         18026.01|                 1|             9.15|              14.0|Unit 9800 Box 287...|Zuniga, Clark and...|                  1|
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
data.columns
['Names', 'Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites', 'Onboard_date', 'Location', 'Company', 'Churn']
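Before formatting the data for MLlib, it's worth confirming the class balance; the mean Churn of about 0.167 in the summary above implies roughly one churner for every five retained customers. A quick sanity check (output omitted here):
# Count rows per label value to confirm the ~5:1 class balance
# implied by the mean Churn of 0.1667 in the summary above.
data.groupBy('Churn').count().show()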
# Format for MLlib.
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=['Age',
'Total_Purchase',
'Account_Manager',
'Years',
'Num_Sites'],outputCol='features')
output = assembler.transform(data)
final_data = output.select('features','churn')
final_data.show(5)
+--------------------+-----+
|            features|churn|
+--------------------+-----+
|[42.0,11066.8,0.0...|    1|
|[41.0,11916.22,0....|    1|
|[38.0,12884.75,0....|    1|
|[42.0,8010.76,0.0...|    1|
|[37.0,9191.58,0.0...|    1|
+--------------------+-----+
only showing top 5 rows
# Train/Test Split.
train_churn,test_churn = final_data.randomSplit([0.7,0.3])
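Note that randomSplit is nondeterministic, so the exact row counts and metrics below will vary from run to run. If reproducibility matters, the split can be seeded (a minor variation on the line above; the seed value 42 is arbitrary):
# Fix the RNG seed so the 70/30 split is reproducible across runs.
train_churn, test_churn = final_data.randomSplit([0.7, 0.3], seed=42)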
# Fitting the model.
from pyspark.ml.classification import LogisticRegression
lr_churn = LogisticRegression(labelCol='churn')
fitted_churn_model = lr_churn.fit(train_churn)
training_sum = fitted_churn_model.summary
training_sum.predictions.describe().show()
+-------+-------------------+-------------------+
|summary|              churn|         prediction|
+-------+-------------------+-------------------+
|  count|                637|                637|
|   mean|0.16169544740973313|0.12244897959183673|
| stddev| 0.3684605252590973| 0.3280612469466197|
|    min|                0.0|                0.0|
|    max|                1.0|                1.0|
+-------+-------------------+-------------------+
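Beyond the summary, the fitted model exposes its learned weights, which can hint at which inputs drive the churn prediction (a quick inspection; the exact values depend on the random split):
# One coefficient per assembled feature, in the same order as the
# VectorAssembler inputCols, followed by the intercept term.
print(fitted_churn_model.coefficients)
print(fitted_churn_model.intercept)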
# Evaluating results.
from pyspark.ml.evaluation import BinaryClassificationEvaluator
pred_and_labels = fitted_churn_model.evaluate(test_churn)
pred_and_labels.predictions.show()
+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[26.0,8787.39,1.0...|    1|[0.52247451935280...|[0.62772620940898...|       0.0|
|[28.0,9090.43,1.0...|    0|[1.37725884472088...|[0.79855039786718...|       0.0|
|[29.0,5900.78,1.0...|    0|[3.60125148312658...|[0.97343538762390...|       0.0|
|[29.0,9378.24,0.0...|    0|[4.56778637312206...|[0.98972574240998...|       0.0|
|[30.0,6744.87,0.0...|    0|[3.24495206175320...|[0.96249129785778...|       0.0|
|[30.0,11575.37,1....|    1|[3.79522948632507...|[0.97801639501919...|       0.0|
|[31.0,5387.75,0.0...|    0|[2.34579569888307...|[0.91259946663920...|       0.0|
|[31.0,8829.83,1.0...|    0|[4.03422121549688...|[0.98260836373827...|       0.0|
|[31.0,9574.89,0.0...|    0|[3.28489425791314...|[0.96390694369899...|       0.0|
|[32.0,8617.98,1.0...|    1|[0.91342946274594...|[0.71370142396379...|       0.0|
|[32.0,9472.72,1.0...|    0|[3.44940619487362...|[0.96921342717021...|       0.0|
|[32.0,11715.72,0....|    0|[3.40370999629175...|[0.96782028086827...|       0.0|
|[32.0,12142.99,0....|    0|[5.51856502702607...|[0.99600442828498...|       0.0|
|[33.0,7720.61,1.0...|    0|[1.43801025932777...|[0.80814633957511...|       0.0|
|[33.0,8556.73,0.0...|    0|[3.54900766774357...|[0.97205047881558...|       0.0|
|[33.0,13157.08,1....|    0|[1.48775334349903...|[0.81574082225073...|       0.0|
|[34.0,6461.86,1.0...|    0|[3.77906078333076...|[0.97766606257828...|       0.0|
|[34.0,7818.13,0.0...|    0|[3.67955174994879...|[0.97538681258149...|       0.0|
|[34.0,9845.35,0.0...|    0|[5.45867715230454...|[0.99575887745635...|       0.0|
|[34.0,10674.92,1....|    0|[3.45930675636331...|[0.96950747923217...|       0.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 20 rows
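Before reducing the test-set performance to a single number, a label-versus-prediction crosstab makes the error types visible (a minimal sketch; the counts will vary with the random split):
# Rows where churn == prediction are correct; the off-diagonal
# combinations are the false positives and false negatives.
pred_and_labels.predictions.groupBy('churn', 'prediction').count().show()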
# Using AUC.
churn_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
labelCol='churn')
auc = churn_eval.evaluate(pred_and_labels.predictions)
auc
0.7733451536643026
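Note that the evaluator above scores the hard 0/1 prediction column, which collapses the ROC curve to a single threshold. Scoring the model's continuous rawPrediction column (the evaluator's default) generally yields a more informative AUC; a minimal alternative:
# Evaluate on the continuous rawPrediction column instead of the
# thresholded 0/1 predictions; this is the evaluator's default column.
raw_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                         labelCol='churn')
raw_auc = raw_eval.evaluate(pred_and_labels.predictions)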
# Predictions on brand-new unlabeled data.
final_lr_model = lr_churn.fit(final_data)
new_customers = spark.read.csv('new_customers.csv',inferSchema=True,
header=True)
new_customers.printSchema()
root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
test_new_customers = assembler.transform(new_customers)
test_new_customers.printSchema()
root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- features: vector (nullable = true)
final_results = final_lr_model.transform(test_new_customers)
final_results.select('Company','prediction').show()
+----------------+----------+
|         Company|prediction|
+----------------+----------+
|        King Ltd|       0.0|
|   Cannon-Benson|       1.0|
|Barron-Robertson|       1.0|
|   Sexton-Golden|       1.0|
|        Wood LLC|       0.0|
|   Parks-Robbins|       1.0|
+----------------+----------+
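Since account-manager capacity is limited, it may also help to rank the new customers by predicted churn probability rather than by the hard 0/1 label. A sketch, assuming Spark 3.0+ for vector_to_array; the probability column holds [P(no churn), P(churn)] vectors:
from pyspark.ml.functions import vector_to_array
# Pull P(churn) out of the probability vector (index 1), then
# list the highest-risk companies first.
(final_results
 .withColumn('churn_prob', vector_to_array('probability')[1])
 .select('Company', 'churn_prob')
 .orderBy('churn_prob', ascending=False)
 .show())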
Now we know that we should assign account managers to Cannon-Benson, Barron-Robertson, Sexton-Golden, and Parks-Robbins.
# The End.