import pandas as pd
from google.colab import drive

drive.mount('/content/drive', force_remount=True)
file_path = '/content/drive/MyDrive/Portfolios/Data-Analyst/Gladys-Lacia-for-Data-Analyst-role/datasets/dataset-Customer-Churn-Telco-California-from-April-to-June-2022.csv'

df = pd.read_csv(file_path, low_memory=False)
df.info()

Mounted at /content/drive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 38 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Customer ID                        7043 non-null   object 
 1   Gender                             7043 non-null   object 
 2   Age                                7043 non-null   int64  
 3   Married                            7043 non-null   object 
 4   Number of Dependents               7043 non-null   int64  
 5   City                               7043 non-null   object 
 6   Zip Code                           7043 non-null   int64  
 7   Latitude                           7043 non-null   float64
 8   Longitude                          7043 non-null   float64
 9   Number of Referrals                7043 non-null   int64  
 10  Tenure in Months                   7043 non-null   int64  
 11  Offer                              3166 non-null   object 
 12  Phone Service                      7043 non-null   object 
 13  Avg Monthly Long Distance Charges  6361 non-null   float64
 14  Multiple Lines                     6361 non-null   object 
 15  Internet Service                   7043 non-null   object 
 16  Internet Type                      5517 non-null   object 
 17  Avg Monthly GB Download            5517 non-null   float64
 18  Online Security                    5517 non-null   object 
 19  Online Backup                      5517 non-null   object 
 20  Device Protection Plan             5517 non-null   object 
 21  Premium Tech Support               5517 non-null   object 
 22  Streaming TV                       5517 non-null   object 
 23  Streaming Movies                   5517 non-null   object 
 24  Streaming Music                    5517 non-null   object 
 25  Unlimited Data                     5517 non-null   object 
 26  Contract                           7043 non-null   object 
 27  Paperless Billing                  7043 non-null   object 
 28  Payment Method                     7043 non-null   object 
 29  Monthly Charge                     7043 non-null   float64
 30  Total Charges                      7043 non-null   float64
 31  Total Refunds                      7043 non-null   float64
 32  Total Extra Data Charges           7043 non-null   int64  
 33  Total Long Distance Charges        7043 non-null   float64
 34  Total Revenue                      7043 non-null   float64
 35  Customer Status                    7043 non-null   object 
 36  Churn Category                     1869 non-null   object 
 37  Churn Reason                       1869 non-null   object 
dtypes: float64(9), int64(6), object(23)
memory usage: 2.0+ MB

df = df.dropna(subset=['Churn Reason'])
df = df.dropna(subset=['Offer'])
df = df.dropna(subset=['Internet Type'])
df = df.dropna(subset=['Avg Monthly Long Distance Charges'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 681 entries, 2 to 7039
Data columns (total 38 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Customer ID                        681 non-null    object 
 1   Gender                             681 non-null    object 
 2   Age                                681 non-null    int64  
 3   Married                            681 non-null    object 
 4   Number of Dependents               681 non-null    int64  
 5   City                               681 non-null    object 
 6   Zip Code                           681 non-null    int64  
 7   Latitude                           681 non-null    float64
 8   Longitude                          681 non-null    float64
 9   Number of Referrals                681 non-null    int64  
 10  Tenure in Months                   681 non-null    int64  
 11  Offer                              681 non-null    object 
 12  Phone Service                      681 non-null    object 
 13  Avg Monthly Long Distance Charges  681 non-null    float64
 14  Multiple Lines                     681 non-null    object 
 15  Internet Service                   681 non-null    object 
 16  Internet Type                      681 non-null    object 
 17  Avg Monthly GB Download            681 non-null    float64
 18  Online Security                    681 non-null    object 
 19  Online Backup                      681 non-null    object 
 20  Device Protection Plan             681 non-null    object 
 21  Premium Tech Support               681 non-null    object 
 22  Streaming TV                       681 non-null    object 
 23  Streaming Movies                   681 non-null    object 
 24  Streaming Music                    681 non-null    object 
 25  Unlimited Data                     681 non-null    object 
 26  Contract                           681 non-null    object 
 27  Paperless Billing                  681 non-null    object 
 28  Payment Method                     681 non-null    object 
 29  Monthly Charge                     681 non-null    float64
 30  Total Charges                      681 non-null    float64
 31  Total Refunds                      681 non-null    float64
 32  Total Extra Data Charges           681 non-null    int64  
 33  Total Long Distance Charges        681 non-null    float64
 34  Total Revenue                      681 non-null    float64
 35  Customer Status                    681 non-null    object 
 36  Churn Category                     681 non-null    object 
 37  Churn Reason                       681 non-null    object 
dtypes: float64(9), int64(6), object(23)
memory usage: 207.5+ KB

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive

drive.mount('/content/drive', force_remount=True)
file_path = '/content/drive/MyDrive/Portfolios/Data-Analyst/Gladys-Lacia-for-Data-Analyst-role/datasets/dataset-Customer-Churn-Telco-California-from-April-to-June-2022.csv'

df = pd.read_csv(file_path, low_memory=False)

contract_counts_before = df['Contract'].value_counts()
print("Contract counts before data cleaning:\n", contract_counts_before)
total_customers_by_contract = contract_counts_before.copy()

df = df.dropna(subset=['Churn Reason', 'Offer', 'Internet Type', 'Avg Monthly Long Distance Charges'])

contract_counts_after = df['Contract'].value_counts()
print("\nContract counts after data cleaning:\n", contract_counts_after)
churned_df = df[df['Customer Status'] == 'Churned']
churned_customers_by_contract = churned_df['Contract'].value_counts()

churn_rate = (churned_customers_by_contract / total_customers_by_contract * 100).round(2)
churn_rate_by_contract = pd.DataFrame({
    'Total Customers': total_customers_by_contract,
    'Churned Customers': churned_customers_by_contract,
    'Churn Rate (%)': churn_rate
}).fillna(0).astype({'Churned Customers': 'int'})
print("\nChurn Rate by Contract Type:\n", churn_rate_by_contract)

plt.figure(figsize=(10, 6))
sns.set_style("whitegrid")
sns.barplot(x=churn_rate_by_contract.index,
            y='Churned Customers',
            data=churn_rate_by_contract,
            color='limegreen',
            label='Churned Customers')
sns.barplot(x=churn_rate_by_contract.index,
            y='Total Customers',
            data=churn_rate_by_contract,
            color='lime',
            alpha=0.5,
            label='Total Customers')
for i, (index, row) in enumerate(churn_rate_by_contract.iterrows()):
    plt.text(i, row['Churned Customers'] + 50, f"{row['Churn Rate (%)']}%",
             ha='center', va='bottom', fontsize=11, fontweight='bold', color='black')

plt.title('Churn Rate by Contract Type', fontsize=14)
plt.ylabel('Number of Customers')
plt.xlabel('Contract Type')
plt.legend()
plt.tight_layout()
plt.show()

Mounted at /content/drive
Contract counts before data cleaning:
 Contract
Month-to-Month    3610
Two Year          1883
One Year          1550
Name: count, dtype: int64

Contract counts after data cleaning:
 Contract
Month-to-Month    597
One Year           65
Two Year           19
Name: count, dtype: int64

Churn Rate by Contract Type:
                 Total Customers  Churned Customers  Churn Rate (%)
Contract                                                          
Month-to-Month             3610                597           16.54
One Year                   1550                 65            4.19
Two Year                   1883                 19            1.01

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import drive

drive.mount('/content/drive', force_remount=True)
file_path = '/content/drive/MyDrive/Portfolios/Data-Analyst/Gladys-Lacia-for-Data-Analyst-role/datasets/dataset-Customer-Churn-Telco-California-from-April-to-June-2022.csv'
df = pd.read_csv(file_path, low_memory=False)
df = df.dropna(subset=['Churn Reason', 'Offer', 'Internet Type', 'Avg Monthly Long Distance Charges'])

churned_df = df[df['Customer Status'] == 'Churned'].copy()

bins_age = [0, 30, 45, 60, 100]
labels_age = ['18-30', '31-45', '46-60', '61+']
churned_df['Age Group'] = pd.cut(churned_df['Age'], bins=bins_age, labels=labels_age)

bins_tenure = [0, 12, 24, 36, 100]
labels_tenure = ['0-12', '13-24', '25-36', '37+']
churned_df['Tenure Group'] = pd.cut(churned_df['Tenure in Months'], bins=bins_tenure, labels=labels_tenure)

# ========== 1. Top Churn Reasons by Age Group ==========
age_reason = churned_df.groupby(['Age Group', 'Churn Reason'], observed=True).size().reset_index(name='Count')
top_age_reasons = age_reason.sort_values(['Age Group', 'Count'], ascending=[True, False])

plt.figure(figsize=(12, 6))
sns.barplot(data=top_age_reasons, x='Age Group', y='Count', hue='Churn Reason', dodge=False)
plt.title('Top Churn Reasons by Age Group')
plt.ylabel('Number of Customers')
plt.xticks(rotation=0)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

Mounted at /content/drive

age_group_18_30_df = df[(df['Age'] >= 18) & (df['Age'] <= 30)]
churn_reasons_18_30 = age_group_18_30_df['Churn Reason'].value_counts()

total_churn_reasons_18_30 = churn_reasons_18_30.sum()
print("\nTotal churn reasons (customers) in age group 18–30:", total_churn_reasons_18_30)
top_reasons_18_30 = churn_reasons_18_30.head(100)
print("Top churn reasons for customers aged 18-30:\n", top_reasons_18_30)

Total churn reasons (customers) in age group 18–30: 143
Top churn reasons for customers aged 18-30:
 Churn Reason
Competitor made better offer                 23
Competitor had better devices                19
Attitude of support person                   18
Competitor offered more data                 17
Attitude of service provider                 16
Network reliability                          10
Competitor offered higher download speeds     7
Don't know                                    6
Service dissatisfaction                       5
Lack of affordable download/upload speed      3
Price too high                                3
Product dissatisfaction                       3
Moved                                         3
Long distance charges                         2
Extra data charges                            2
Lack of self-service on Website               2
Limited range of services                     1
Poor expertise of phone support               1
Deceased                                      1
Poor expertise of online support              1
Name: count, dtype: int64

age_group_31_45_df = df[(df['Age'] >= 31) & (df['Age'] <= 45)]
churn_reasons_31_45 = age_group_31_45_df['Churn Reason'].value_counts()

total_churn_reasons_31_45 = churn_reasons_31_45.sum()
print("\nTotal churn reasons (customers) in age group 31-45:", total_churn_reasons_31_45)
top_reasons_31_45 = churn_reasons_31_45.head(100)
print("Top churn reasons for customers aged 31-45:\n", top_reasons_31_45)

Total churn reasons (customers) in age group 31-45: 140
Top churn reasons for customers aged 31-45:
 Churn Reason
Competitor had better devices                31
Competitor made better offer                 17
Attitude of support person                   16
Competitor offered higher download speeds    11
Don't know                                   10
Competitor offered more data                  9
Attitude of service provider                  7
Moved                                         6
Product dissatisfaction                       6
Service dissatisfaction                       5
Network reliability                           5
Lack of self-service on Website               5
Lack of affordable download/upload speed      3
Extra data charges                            2
Limited range of services                     2
Long distance charges                         2
Poor expertise of online support              1
Poor expertise of phone support               1
Price too high                                1
Name: count, dtype: int64

age_group_46_60_df = df[(df['Age'] >= 46) & (df['Age'] <= 60)]
churn_reasons_46_60 = age_group_46_60_df['Churn Reason'].value_counts()

total_churn_reasons_46_60 = churn_reasons_46_60.sum()
print("\nTotal churn reasons (customers) in age group 46-60:", total_churn_reasons_46_60)
top_reasons_46_60 = churn_reasons_46_60.head(100)
print("Top churn reasons for customers aged 46-60:\n", top_reasons_46_60)

Total churn reasons (customers) in age group 46-60: 179
Top churn reasons for customers aged 46-60:
 Churn Reason
Competitor had better devices                33
Competitor made better offer                 23
Attitude of support person                   22
Competitor offered higher download speeds    16
Competitor offered more data                 14
Attitude of service provider                 11
Price too high                                9
Network reliability                           8
Moved                                         8
Don't know                                    8
Product dissatisfaction                       6
Lack of affordable download/upload speed      4
Extra data charges                            4
Poor expertise of online support              4
Service dissatisfaction                       4
Long distance charges                         2
Lack of self-service on Website               2
Limited range of services                     1
Name: count, dtype: int64

age_group_61plus_df = df[df['Age'] >= 61]
churn_reasons_61plus = age_group_61plus_df['Churn Reason'].value_counts()

total_churn_reasons_61plus = churn_reasons_61plus.sum()
print("\nTotal churn reasons (customers) in age group 61+:", total_churn_reasons_61plus)
top_reasons_61plus = churn_reasons_61plus.head(100)
print("Top churn reasons for customers aged 61+:\n", top_reasons_61plus)

Total churn reasons (customers) in age group 61+: 219
Top churn reasons for customers aged 61+:
 Churn Reason
Competitor made better offer                 59
Competitor had better devices                31
Attitude of support person                   22
Don't know                                   14
Price too high                               13
Product dissatisfaction                      10
Competitor offered more data                 10
Attitude of service provider                 10
Extra data charges                            9
Limited range of services                     7
Long distance charges                         7
Competitor offered higher download speeds     6
Service dissatisfaction                       6
Network reliability                           6
Lack of affordable download/upload speed      5
Moved                                         2
Poor expertise of online support              2
Name: count, dtype: int64

# ========== 2. Top Churn Reasons by Tenure Group ==========
tenure_reason = churned_df.groupby(['Tenure Group', 'Churn Reason'], observed=True).size().reset_index(name='Count')
top_tenure_reasons = tenure_reason.sort_values(['Tenure Group', 'Count'], ascending=[True, False])

plt.figure(figsize=(12, 6))
sns.barplot(data=top_tenure_reasons, x='Tenure Group', y='Count', hue='Churn Reason', dodge=False)
plt.title('Top Churn Reasons by Tenure in Months')
plt.ylabel('Number of Customers')
plt.xticks(rotation=0)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

tenure_group_0_12_df = df[(df['Tenure in Months'] >= 0) & (df['Tenure in Months'] <= 12)]
churn_reasons_0_12 = tenure_group_0_12_df['Churn Reason'].value_counts()

total_churn_reasons_0_12 = churn_reasons_0_12.sum()
print("\nTotal churn reasons (customers) in tenure group 0-12:", total_churn_reasons_0_12)
top_reasons_0_12 = churn_reasons_0_12.head(100)
print("Top churn reasons for customers (0-12 months):\n", top_reasons_0_12)

Total churn reasons (customers) in tenure group 0-12: 372
Top churn reasons for customers (0-12 months):
 Churn Reason
Competitor made better offer                 74
Competitor had better devices                69
Attitude of support person                   46
Attitude of service provider                 28
Competitor offered more data                 22
Don't know                                   22
Competitor offered higher download speeds    21
Network reliability                          17
Moved                                        10
Product dissatisfaction                       9
Price too high                                9
Lack of affordable download/upload speed      8
Service dissatisfaction                       8
Extra data charges                            8
Long distance charges                         5
Lack of self-service on Website               5
Limited range of services                     4
Poor expertise of online support              4
Poor expertise of phone support               2
Deceased                                      1
Name: count, dtype: int64

tenure_group_13_24_df = df[(df['Tenure in Months'] >= 13) & (df['Tenure in Months'] <= 24)]
churn_reasons_13_24 = tenure_group_13_24_df['Churn Reason'].value_counts()

total_churn_reasons_13_24 = churn_reasons_13_24.sum()
print("\nTotal churn reasons (customers) in tenure group 13-24:", total_churn_reasons_13_24)
top_reasons_13_24 = churn_reasons_13_24.head(100)
print("Top churn reasons for customers (13-24 months):\n", top_reasons_13_24)

Total churn reasons (customers) in tenure group 13-24: 106
Top churn reasons for customers (13-24 months):
 Churn Reason
Competitor had better devices                20
Attitude of support person                   15
Competitor made better offer                 12
Competitor offered more data                  7
Don't know                                    6
Service dissatisfaction                       6
Competitor offered higher download speeds     5
Attitude of service provider                  5
Network reliability                           4
Limited range of services                     4
Lack of self-service on Website               4
Product dissatisfaction                       3
Long distance charges                         3
Price too high                                3
Moved                                         3
Extra data charges                            3
Lack of affordable download/upload speed      2
Poor expertise of online support              1
Name: count, dtype: int64

tenure_group_25_36_df = df[(df['Tenure in Months'] >= 25) & (df['Tenure in Months'] <= 36)]
churn_reasons_25_36 = tenure_group_25_36_df['Churn Reason'].value_counts()

total_churn_reasons_25_36 = churn_reasons_25_36.sum()
print("\nTotal churn reasons (customers) in tenure group 25-36:", total_churn_reasons_25_36)
top_reasons_25_36 = churn_reasons_25_36.head(100)
print("Top churn reasons for customers (25-36 months):\n", top_reasons_25_36)

Total churn reasons (customers) in tenure group 25-36: 66
Top churn reasons for customers (25-36 months):
 Churn Reason
Competitor made better offer                 14
Attitude of support person                    9
Competitor had better devices                 7
Competitor offered higher download speeds     5
Extra data charges                            4
Competitor offered more data                  4
Price too high                                4
Don't know                                    4
Product dissatisfaction                       3
Service dissatisfaction                       3
Attitude of service provider                  2
Poor expertise of online support              2
Moved                                         2
Long distance charges                         1
Limited range of services                     1
Network reliability                           1
Name: count, dtype: int64

tenure_group_37plus_df = df[df['Age'] >= 37]
churn_reasons_37plus = tenure_group_37plus_df['Churn Reason'].value_counts()

total_churn_reasons_37plus = churn_reasons_37plus.sum()
print("\nTotal churn reasons (customers) in tenure group 37+:", total_churn_reasons_37plus)
top_reasons_37plus = churn_reasons_37plus.head(100)
print("Top churn reasons for customers (37+ months):\n", top_reasons_37plus)

Total churn reasons (customers) in tenure group 37+: 481
Top churn reasons for customers (37+ months):
 Churn Reason
Competitor made better offer                 89
Competitor had better devices                84
Attitude of support person                   53
Competitor offered higher download speeds    30
Competitor offered more data                 28
Don't know                                   27
Attitude of service provider                 27
Price too high                               23
Product dissatisfaction                      20
Network reliability                          18
Moved                                        14
Extra data charges                           14
Lack of affordable download/upload speed     11
Long distance charges                        11
Service dissatisfaction                      10
Limited range of services                     9
Poor expertise of online support              7
Lack of self-service on Website               5
Poor expertise of phone support               1
Name: count, dtype: int64

# ========== 3. Top Churn Reasons by Marital Status ==========
marital_reason = churned_df.groupby(['Married', 'Churn Reason'], observed=True).size().reset_index(name='Count')
top_marital_reasons = marital_reason.sort_values(['Married', 'Count'], ascending=[True, False])

plt.figure(figsize=(10, 6))
sns.barplot(data=top_marital_reasons, x='Married', y='Count', hue='Churn Reason', dodge=False)
plt.title('Top Churn Reasons by Marital Status')
plt.ylabel('Number of Customers')
plt.xticks(rotation=0)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

married_no_df = df[df['Married'] == 'No']
churn_reasons_no = married_no_df['Churn Reason'].value_counts()

total_churn_reasons_no = churn_reasons_no.sum()
print("\nTotal churn reasons of non-married customers:", total_churn_reasons_no)
top_reasons_no = churn_reasons_no.head(100)
print("Top churn reasons for non-married customers:\n", top_reasons_no)

Total churn reasons of non-married customers: 436
Top churn reasons for non-married customers:
 Churn Reason
Competitor made better offer                 79
Competitor had better devices                74
Attitude of support person                   54
Attitude of service provider                 31
Don't know                                   29
Competitor offered more data                 28
Competitor offered higher download speeds    27
Price too high                               17
Network reliability                          16
Service dissatisfaction                      13
Moved                                        12
Lack of affordable download/upload speed     11
Product dissatisfaction                       9
Long distance charges                         8
Extra data charges                            8
Lack of self-service on Website               7
Poor expertise of online support              7
Limited range of services                     3
Poor expertise of phone support               2
Deceased                                      1
Name: count, dtype: int64

married_yes_df = df[df['Married'] == 'Yes']
churn_reasons_yes = married_yes_df['Churn Reason'].value_counts()

total_churn_reasons_yes = churn_reasons_yes.sum()
print("\nTotal churn reasons of married customers:", total_churn_reasons_yes)
top_reasons_yes = churn_reasons_yes.head(100)
print("Top churn reasons for married customers:\n", top_reasons_yes)

Total churn reasons of married customers: 245
Top churn reasons for married customers:
 Churn Reason
Competitor made better offer                 43
Competitor had better devices                40
Attitude of support person                   24
Competitor offered more data                 22
Product dissatisfaction                      16
Competitor offered higher download speeds    13
Attitude of service provider                 13
Network reliability                          13
Don't know                                    9
Extra data charges                            9
Price too high                                9
Limited range of services                     8
Moved                                         7
Service dissatisfaction                       7
Long distance charges                         5
Lack of affordable download/upload speed      4
Lack of self-service on Website               2
Poor expertise of online support              1
Name: count, dtype: int64

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive

drive.mount('/content/drive', force_remount=True)
file_path = '/content/drive/MyDrive/Portfolios/Data-Analyst/Gladys-Lacia-for-Data-Analyst-role/datasets/dataset-Customer-Churn-Telco-California-from-April-to-June-2022.csv'

df = pd.read_csv(file_path, low_memory=False)
df_raw = df.copy()

# --- Combining Internet Type and Streaming Usage Counts (Before Cleaning) ---
internet_type_counts_before = df['Internet Type'].value_counts()
streaming_counts_before = df[['Streaming TV', 'Streaming Movies', 'Streaming Music']].apply(lambda x: x == 'Yes').sum()

counts_before_df = pd.concat([internet_type_counts_before, streaming_counts_before])
counts_before_df = counts_before_df.rename_axis('Internet Type / Streaming Usage').reset_index(name='Total Customers')

print("\n--- Internet Type & Streaming Usage BEFORE Data Cleaning ---\n")
print(counts_before_df.to_string(index=False))

total_customers_by_internet_type = internet_type_counts_before.copy()

# --- Data Cleaning ---
df = df.dropna(subset=['Churn Reason', 'Offer', 'Internet Type', 'Avg Monthly Long Distance Charges'])
churned_df = df[df['Customer Status'] == 'Churned']
streaming_cols = ['Streaming TV', 'Streaming Movies', 'Streaming Music']

# --- Combining Internet Type and Streaming Usage Counts (After Cleaning) ---
internet_type_counts_after = df['Internet Type'].value_counts()
streaming_counts_after = df[['Streaming TV', 'Streaming Movies', 'Streaming Music']].apply(lambda x: x == 'Yes').sum()

counts_after_df = pd.concat([internet_type_counts_after, streaming_counts_after])
counts_after_df = counts_after_df.rename_axis('Internet Type / Streaming Usage').reset_index(name='Churned Customers')

print("\n--- Internet Type & Streaming Usage AFTER Data Cleaning ---\n")
print(counts_after_df.to_string(index=False))

churned_customers_by_internet_type = churned_df['Internet Type'].value_counts()
churned_customers_by_streaming_tv = churned_df['Streaming TV'].value_counts()
churned_customers_by_streaming_movies = churned_df['Streaming Movies'].value_counts()
churned_customers_by_streaming_music = churned_df['Streaming Music'].value_counts()

churn_rate_internet_type = (churned_customers_by_internet_type / total_customers_by_internet_type * 100).round(2)
churn_rate_by_internet_type = pd.DataFrame({
    'Total Customers': total_customers_by_internet_type,
    'Churned Customers': churned_customers_by_internet_type,
    'Churn Rate (%)': churn_rate_internet_type
}).fillna(0).astype({'Churned Customers': 'int'})

streaming_total_before = df_raw[['Streaming TV', 'Streaming Movies', 'Streaming Music']].apply(lambda x: x == 'Yes').sum()
streaming_churned_after = churned_df[['Streaming TV', 'Streaming Movies', 'Streaming Music']].apply(lambda x: x == 'Yes').sum()
churn_rate_streaming_usage = ((streaming_churned_after / streaming_total_before) * 100).round(2)

streaming_df = pd.DataFrame({
    'Total Customers': streaming_total_before,
    'Churned Customers': streaming_churned_after,
    'Churn Rate (%)': churn_rate_streaming_usage
}).astype({'Churned Customers': 'int'})

churn_rate_by_internet_type_and_streaming_usage = pd.concat([
    churn_rate_by_internet_type,
    streaming_df
])

print("\nChurn Rate by Internet Type & Streaming Usage:\n", churn_rate_by_internet_type_and_streaming_usage)

# --- Internet Type Data ---
internet_type_total = df_raw['Internet Type'].value_counts()
internet_type_churned = churned_df['Internet Type'].value_counts()
internet_type_df = pd.DataFrame({
    'Churned': internet_type_churned,
    'Total': internet_type_total
}).fillna(0).astype(int)
internet_type_df['Retained'] = internet_type_df['Total'] - internet_type_df['Churned']
internet_type_df['Churn Rate (%)'] = (internet_type_df['Churned'] / internet_type_df['Total'] * 100).round(2)

# --- Streaming Usage Data ---
streaming_total = df_raw[streaming_cols].apply(lambda x: x == 'Yes').sum()
streaming_churned = churned_df[streaming_cols].apply(lambda x: x == 'Yes').sum()
streaming_df = pd.DataFrame({
    'Churned': streaming_churned,
    'Total': streaming_total
}).astype(int)
streaming_df['Retained'] = streaming_df['Total'] - streaming_df['Churned']
streaming_df['Churn Rate (%)'] = (streaming_df['Churned'] / streaming_df['Total'] * 100).round(2)

# --- Plotting Stacked Bars ---
fig, axes = plt.subplots(1, 2, figsize=(16, 6), sharey=True)

# Internet Type Stack Bars
internet_type_df[['Churned', 'Retained']].plot(
    kind='bar',
    stacked=True,
    ax=axes[0],
    color=['lime', 'limegreen'], rot=0
)
axes[0].set_title("Churn Rate by Internet Type")
axes[0].set_ylabel("Number of Customers")
axes[0].set_xlabel(" ")
axes[0].legend(["Churned", "Retained"])

# churn rate percentages in Internet Type
for idx, row in internet_type_df.iterrows():
    axes[0].text(
        x=internet_type_df.index.get_loc(idx),
        y=row['Churned'] + 20,
        s=f"{row['Churn Rate (%)']}%",
        ha='center',
        fontsize=10,
        fontweight='bold'
    )

# Streaming Usage Stack Bars
streaming_df[['Churned', 'Retained']].plot(
    kind='bar',
    stacked=True,
    ax=axes[1],
    color=['yellow', 'gold'], rot=0
)
axes[1].set_title("Churn Rate by Streaming Usage")
axes[1].set_xlabel(" ")
axes[1].legend(["Churned", "Retained"])

# churn rate percentages in Streaming Usage
for idx, row in streaming_df.iterrows():
    axes[1].text(
        x=streaming_df.index.get_loc(idx),
        y=row['Churned'] + 20,
        s=f"{row['Churn Rate (%)']}%",
        ha='center',
        fontsize=10,
        fontweight='bold'
    )

plt.tight_layout()
plt.show()

Mounted at /content/drive

--- Internet Type & Streaming Usage BEFORE Data Cleaning ---

Internet Type / Streaming Usage  Total Customers
                    Fiber Optic             3035
                            DSL             1652
                          Cable              830
                   Streaming TV             2707
               Streaming Movies             2732
                Streaming Music             2488

--- Internet Type & Streaming Usage AFTER Data Cleaning ---

Internet Type / Streaming Usage  Churned Customers
                    Fiber Optic                526
                            DSL                 86
                          Cable                 69
                   Streaming TV                313
               Streaming Movies                317
                Streaming Music                285

Churn Rate by Internet Type & Streaming Usage:
                   Total Customers  Churned Customers  Churn Rate (%)
Fiber Optic                  3035                526           17.33
DSL                          1652                 86            5.21
Cable                         830                 69            8.31
Streaming TV                 2707                313           11.56
Streaming Movies             2732                317           11.60
Streaming Music              2488                285           11.45

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

churned_df = df[df['Customer Status'] == 'Churned']
churned_df = churned_df.dropna(subset=['Churn Category', 'Tenure in Months'])

median_values = churned_df.groupby('Churn Category')['Tenure in Months'].median()
mean_values = churned_df.groupby('Churn Category')['Tenure in Months'].mean()


sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))
sns.boxplot(
    data=churned_df,
    x='Churn Category',
    y='Tenure in Months',
    color='yellow'
)
for i, category in enumerate(median_values.index):
    # median
    plt.scatter(i, median_values[category], color='teal', zorder=10, label='Median' if i == 0 else "")
    plt.text(i, median_values[category] + 1, f"Median: {median_values[category]:.1f}",
             ha='center', va='bottom', color='teal', fontsize=9)
    # mean
    plt.scatter(i, mean_values[category], color='red', zorder=10, label='Mean' if i == 0 else "")
    plt.text(i, mean_values[category] + 1, f"Mean: {mean_values[category]:.1f}",
             ha='center', va='bottom', color='red', fontsize=9)

plt.title('Customer Tenure by Churn Category', fontsize=14)
plt.xlabel('Churn Category', fontsize=12)
plt.ylabel('Tenure in Months', fontsize=12)
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
import matplotlib.pyplot as plt
import pandas as pd

# selected features based on importance
selected_features = [
    'Contract',
    'Tenure in Months',
    'Monthly Charge',
    'Internet Type',
    'Online Security',
    'Offer',
    'Payment Method',
    'Streaming TV',
    'Streaming Movies',
    'Total Refunds',
    'Total Extra Data Charges'
]

# filter dataset to include only churned customers
churned_data = df[df['Customer Status'] == 'Churned']

X = churned_data[selected_features]
y = churned_data['Churn Category']

# define feature types
numerical = ['Tenure in Months', 'Monthly Charge', 'Total Refunds', 'Total Extra Data Charges']
categorical = list(set(selected_features) - set(numerical))

# preprocessing pipeline
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical),
    ('cat', OneHotEncoder(drop='first'), categorical)
])

# train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# pipeline with OneVsRestClassifier
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', OneVsRestClassifier(LogisticRegression(max_iter=1000, class_weight='balanced')))
])

# train and predict
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# confusion matrix
labels = model.named_steps['classifier'].classes_
cm = confusion_matrix(y_test, y_pred, labels=labels)

# display
fig, ax = plt.subplots(figsize=(10, 8))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap='Greens', ax=ax, xticks_rotation=45)
ax.grid(False)

plt.title("Churn Category Prediction by Multi-class Classification Model with One-vs-Rest (OvR) Logistic Regression")
plt.tight_layout()
plt.show()

accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy Score: {accuracy:.2%}")

Accuracy Score: 22.63%

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# define and fit a RandomForestClassifier since pipeline was not used for training
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(model.named_steps['preprocessor'].transform(X_train), y_train)

# Get feature importances
importances = rf_classifier.feature_importances_

# Get encoded feature names
ohe = model.named_steps['preprocessor'].named_transformers_['cat']
encoded_cat_cols = ohe.get_feature_names_out()

# Combine feature names
all_features = numerical + list(encoded_cat_cols)  # using numerical from previous cell

# Create importance DataFrame
importance_df = pd.DataFrame({
    'Feature': all_features,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print(importance_df)

# Plot
plt.figure(figsize=(12, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df, color='seagreen')
plt.title("Feature Importance by Random Forest Classifier for Customer Churn Patterns")
plt.xlabel("Importance Levels")
plt.ylabel("Features")
plt.tight_layout()
plt.show()

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy Score: {accuracy:.2%}")

                        Feature  Importance
1                Monthly Charge    0.424992
0              Tenure in Months    0.222629
3      Total Extra Data Charges    0.052282
10   Payment Method_Credit Card    0.037085
16         Streaming Movies_Yes    0.035567
13             Streaming TV_Yes    0.035337
2                 Total Refunds    0.029555
12          Online Security_Yes    0.028108
15    Internet Type_Fiber Optic    0.024585
14            Internet Type_DSL    0.017279
11  Payment Method_Mailed Check    0.016869
8             Contract_One Year    0.016689
6                 Offer_Offer D    0.015649
5                 Offer_Offer C    0.013337
7                 Offer_Offer E    0.011715
4                 Offer_Offer B    0.011628
9             Contract_Two Year    0.006695

Accuracy Score: 22.63%

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

ovr_clf = model.named_steps['classifier']
estimators = ovr_clf.estimators_

ohe = model.named_steps['preprocessor'].named_transformers_['cat']
encoded_cat_cols = ohe.get_feature_names_out()

all_features = numerical + list(encoded_cat_cols)

coef_df = pd.DataFrame(
    [est.coef_.flatten() for est in estimators],
    columns=all_features,
    index=ovr_clf.classes_
).T
coef_df['mean_abs'] = coef_df.abs().mean(axis=1)
coef_df_sorted = coef_df.sort_values(by='mean_abs', ascending=True).drop('mean_abs', axis=1)

plt.figure(figsize=(14, 10))
coef_df_sorted.plot(kind='barh', figsize=(14, 10), colormap='Paired')
plt.title("Logistic Regression Coefficients by Churn Category")
plt.xlabel("Coefficient Value")
plt.ylabel("Feature")
plt.tight_layout()
plt.legend(title="Churn Category", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(False)
plt.show()

<Figure size 1400x1000 with 0 Axes>

Customer Churn Analysis of a Telecommunications company in California from April to June 2022¶

Rationale¶

Analysis 1: Churn Rate by Contract Type¶

Insights¶

Recommendations¶

Analysis 2: Top Churn Reasons by Customer Segment¶

Insights¶

Recommendations¶

Analysis 3: Churn Rate by Internet Type & Streaming Usage¶

Insights¶

Recommendations¶

Analysis 4: Tenure in Months vs. Churn Category¶

Insights¶

Recommendations¶

Analysis 5: Churn Category Prediction with Logistic Regression¶

Insights¶

Recommendations¶

Notes¶

Data and Model Problem Oversights and Future Solutions¶