What should my code be to get the size of my six clusters  Here is my code so far  # Install necessary libraries import pandas as pd import numpy as np import seaborn as sbn from google.colab import files from sklearn.cluster import KMeans from sklearn.metrics import silhouette_score import matplotlib.pyplot as plt from scipy.stats import skew   # suppress scientific notation in later outputs pd.options.display.float_format = '{:.2f}'.format np.set_printoptions(suppress = True)   # load cluster data and remove any commas so we can process properly df = pd.read_csv('CC GENERAL.csv') df = df.replace(',','',regex = True)   #elbow graph to see how many cluster to do  elbow_kwargs =  {'init':'random', 'n_init':10, 'max_iter':480, 'random_state':48} sse = [] for x in range(1,15):     model_chk = KMeans(n_clusters = x, **elbow_kwargs)     model_chk.fit(df)     sse.append(model_chk.inertia_) plt.style.use('fivethirtyeight') plt.plot(range(1,15),sse) plt.xticks(range(1,15)) plt.xlabel('Clusters') plt.ylabel('SSE') plt.show()   #Drop all null values df =df.dropna() df.dropna(inplace=True)   # now we buil out our model for real # using the same hyper parameters as above # only change is solidifying our n_clusters based on the elbow model_kmean = KMeans(init = 'random', n_clusters = 6, n_init = 10, max_iter = 480, random_state = 48) # run the model on our dataset model_kmean.fit(df)   pd.options.display.max_columns = False # when looking at this dataframe we want to look for places where the values are vastly different from one another # columns where there are very little variances tell us that field has no distinction between clusters aka are not deterministic display(cluster_df) # here we can pull which fields have high levels of skew # this shows us the fields where 1 or 2 of the clusters vary signficantly from the others # good place to start for identifying attributes that are distinctive and show clear splits in behavior high_skew_fields = cluster_df.skew().loc[lambda x: abs(x) >= 0.5].index display(cluster_df[high_skew_fields])       #plotting the results plt.scatter(cluster_df,cluster_df) plt.show()

Database System Concepts
7th Edition
ISBN:9780078022159
Author:Abraham Silberschatz Professor, Henry F. Korth, S. Sudarshan
Publisher:Abraham Silberschatz Professor, Henry F. Korth, S. Sudarshan
Chapter1: Introduction
Section: Chapter Questions
Problem 1PE
icon
Related questions
Question

What should my code be to get the size of my six clusters 

Here is my code so far 

# Install necessary libraries
import pandas as pd
import numpy as np
import seaborn as sbn
from google.colab import files
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from scipy.stats import skew
 
# suppress scientific notation in later outputs
pd.options.display.float_format = '{:.2f}'.format
np.set_printoptions(suppress = True)
 
# load cluster data and remove any commas so we can process properly
df = pd.read_csv('CC GENERAL.csv')
df = df.replace(',','',regex = True)
 
#elbow graph to see how many cluster to do 
elbow_kwargs =  {'init':'random', 'n_init':10, 'max_iter':480, 'random_state':48}

sse = []

for x in range(1,15):
    model_chk = KMeans(n_clusters = x, **elbow_kwargs)
    model_chk.fit(df)
    sse.append(model_chk.inertia_)


plt.style.use('fivethirtyeight')
plt.plot(range(1,15),sse)
plt.xticks(range(1,15))
plt.xlabel('Clusters')
plt.ylabel('SSE')
plt.show()
 
#Drop all null values
df =df.dropna()
df.dropna(inplace=True)
 
# now we buil out our model for real
# using the same hyper parameters as above
# only change is solidifying our n_clusters based on the elbow
model_kmean = KMeans(init = 'random', n_clusters = 6, n_init = 10, max_iter = 480, random_state = 48)

# run the model on our dataset
model_kmean.fit(df)
 
pd.options.display.max_columns = False

# when looking at this dataframe we want to look for places where the values are vastly different from one another
# columns where there are very little variances tell us that field has no distinction between clusters aka are not deterministic
display(cluster_df)

# here we can pull which fields have high levels of skew
# this shows us the fields where 1 or 2 of the clusters vary signficantly from the others
# good place to start for identifying attributes that are distinctive and show clear splits in behavior
high_skew_fields = cluster_df.skew().loc[lambda x: abs(x) >= 0.5].index
display(cluster_df[high_skew_fields])
 
 
 
#plotting the results
plt.scatter(cluster_df,cluster_df)
plt.show()
AI-Generated Solution
AI-generated content may present inaccurate or offensive content that does not represent bartleby’s views.
steps

Unlock instant AI solutions

Tap the button
to generate a solution

Recommended textbooks for you
Database System Concepts
Database System Concepts
Computer Science
ISBN:
9780078022159
Author:
Abraham Silberschatz Professor, Henry F. Korth, S. Sudarshan
Publisher:
McGraw-Hill Education
Starting Out with Python (4th Edition)
Starting Out with Python (4th Edition)
Computer Science
ISBN:
9780134444321
Author:
Tony Gaddis
Publisher:
PEARSON
Digital Fundamentals (11th Edition)
Digital Fundamentals (11th Edition)
Computer Science
ISBN:
9780132737968
Author:
Thomas L. Floyd
Publisher:
PEARSON
C How to Program (8th Edition)
C How to Program (8th Edition)
Computer Science
ISBN:
9780133976892
Author:
Paul J. Deitel, Harvey Deitel
Publisher:
PEARSON
Database Systems: Design, Implementation, & Manag…
Database Systems: Design, Implementation, & Manag…
Computer Science
ISBN:
9781337627900
Author:
Carlos Coronel, Steven Morris
Publisher:
Cengage Learning
Programmable Logic Controllers
Programmable Logic Controllers
Computer Science
ISBN:
9780073373843
Author:
Frank D. Petruzella
Publisher:
McGraw-Hill Education