Modeling (Scikit-Learn) Exercise¶


Philip Vishnevsky¶

Part 1¶

Install scikit-learn if you have not yet done so!

In [7]:
!pip install scikit-learn
Requirement already satisfied: scikit-learn in /opt/anaconda3/lib/python3.12/site-packages (1.4.2)
Requirement already satisfied: numpy>=1.19.5 in /opt/anaconda3/lib/python3.12/site-packages (from scikit-learn) (1.26.4)
Requirement already satisfied: scipy>=1.6.0 in /opt/anaconda3/lib/python3.12/site-packages (from scikit-learn) (1.13.1)
Requirement already satisfied: joblib>=1.2.0 in /opt/anaconda3/lib/python3.12/site-packages (from scikit-learn) (1.4.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/anaconda3/lib/python3.12/site-packages (from scikit-learn) (2.2.0)

Import some libraries first¶

In [9]:
import pandas as pd # for working with data
import numpy as np # for working with data
import seaborn as sns # for making visualizations
from matplotlib import pyplot as plt # for making visualizations

Load our data¶

We are going to work with some mushroom data to classify if something is poisonous or not. Remember this from the python exercise? Instead of figuring out those if statements ourselves, we will have a model (specifically, decision trees) do it for us!
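For contrast, here is what those hand-written rules look like as code. This is only a sketch in the spirit of that exercise; the thresholds and rules below are made up for illustration, not the actual rules from it:

```python
def classify(mushroom):
    # hand-written rules (hypothetical thresholds, for illustration only)
    if mushroom["has_skull"]:
        return True  # skull mark -> poisonous
    if mushroom["cap_color"] == "purple":
        return True
    return mushroom["cap_diameter"] > 30

print(classify({"cap_diameter": 5, "cap_color": "green", "has_skull": False}))  # → False
```

A decision tree learns rules of exactly this shape (feature comparisons leading to a class), but it picks the features and thresholds from the data instead of us guessing them.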


In [11]:
mushrooms = [
    {"cap_diameter": 15, "cap_color": "purple", "stem_width": 2, "has_skull":True, 'poisonous':True},
    {"cap_diameter": 25, "cap_color": "orange", "stem_width": 5, "has_skull":True, 'poisonous':True},
    {"cap_diameter": 3, "cap_color": "green", "stem_width": 6, "has_skull":False, 'poisonous':False},
    {"cap_diameter": 8, "cap_color": "green", "stem_width": 3, "has_skull":False, 'poisonous':False},
    {"cap_diameter": 55, "cap_color": "green", "stem_width": 35, "has_skull":False, 'poisonous':True},
    {"cap_diameter": 7, "cap_color": "purple", "stem_width": 6, "has_skull":False, 'poisonous':True},
    {"cap_diameter": 3, "cap_color": "purple", "stem_width": 8, "has_skull":True, 'poisonous':True},
    {"cap_diameter": 35, "cap_color": "green", "stem_width": 4, "has_skull":False, 'poisonous':True},
    {"cap_diameter": 23, "cap_color": "green", "stem_width": 5, "has_skull":False, 'poisonous':False}
]

In the python exercise, we used a list of dictionaries as our 'mushroom data.' Let's convert this now to a DataFrame.

In [13]:
df = pd.DataFrame(mushrooms)
In [14]:
df
Out[14]:
cap_diameter cap_color stem_width has_skull poisonous
0 15 purple 2 True True
1 25 orange 5 True True
2 3 green 6 False False
3 8 green 3 False False
4 55 green 35 False True
5 7 purple 6 False True
6 3 purple 8 True True
7 35 green 4 False True
8 23 green 5 False False

Understand our data¶

Changing the color palettes for fun. :)
More info on palettes: https://seaborn.pydata.org/tutorial/color_palettes.html?highlight=palette

In [17]:
sns.set_theme(palette="Accent")
In [18]:
# xkcd style :) 
sns.set_style('white')
#plt.xkcd();

Do we have any nulls to deal with?¶

TODO: Check for nulls. How many nulls (if any) do we have?

In [21]:
# TODO
df.isnull().sum()
Out[21]:
cap_diameter    0
cap_color       0
stem_width      0
has_skull       0
poisonous       0
dtype: int64

Nope! This is a super small dataset.

Do we have a balanced dataset?¶

Check how many items of each class (poisonous or not) we have. Is it balanced (meaning, is it close to having 50% for each class)?

TODO: Create a countplot of our poisonous column

In [25]:
# TODO. Do something to answer this question. 
# A visualization or describing the data in some way (don't just manually count though!)
sns.countplot(data=df, x="poisonous")
plt.show()

Not really. We will have to keep this in mind later when evaluating our model.
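Besides a countplot, `value_counts(normalize=True)` puts a number on the imbalance. A small standalone sketch (the `poisonous` column is restated here so the snippet runs on its own):

```python
import pandas as pd

# the 'poisonous' column from the toy dataset above
poisonous = pd.Series([True, True, False, False, True, True, True, True, False])

# normalize=True turns counts into proportions
print(poisonous.value_counts(normalize=True))
```

About two thirds of the rows are poisonous, which confirms the countplot's story.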

Is there a split among any of the features? Which features seem useful for predicting our target value (poisonous)?¶

Here is an example.

In [29]:
sns.countplot(x = 'cap_color', data=df, hue = 'poisonous')
Out[29]:
<Axes: xlabel='cap_color', ylabel='count'>

Since the purple and orange cap colors are 100% for the poisonous class, this may be a helpful feature for classifying our mushrooms into poisonous or not.
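To put a number on that split, group by the feature and take the mean of the boolean target: the mean of a True/False column is the fraction of poisonous mushrooms per group. The two relevant columns are restated so the snippet runs on its own:

```python
import pandas as pd

# cap_color and poisonous columns of the toy dataset, reconstructed standalone
df = pd.DataFrame({
    "cap_color": ["purple", "orange", "green", "green", "green", "purple", "purple", "green", "green"],
    "poisonous": [True, True, False, False, True, True, True, True, False],
})

# mean of a boolean column per group = fraction poisonous for each cap color
print(df.groupby("cap_color")["poisonous"].mean())
```

Purple and orange come out at 1.0 (always poisonous), while green is mixed.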

TODO: Now try checking some other features on your own. Which features seem like they may be useful? What visualizations can be made to help you gain an understanding of which features may be more relevant?

In [32]:
#TODO
# note: cap_diameter is numeric, so a histogram (sns.histplot) may read better than a countplot
sns.countplot(data=df, x="cap_diameter", hue='poisonous')
Out[32]:
<Axes: xlabel='cap_diameter', ylabel='count'>

Pre-processing: Preparing for our modeling¶

First, let's check the datatypes of our columns.

In [34]:
df.dtypes
Out[34]:
cap_diameter     int64
cap_color       object
stem_width       int64
has_skull         bool
poisonous         bool
dtype: object

We have ints, a string, and some booleans. We need to convert our string to numbers - something our model will be able to work with!
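As a quick illustration of what this conversion (one-hot encoding) produces, here is a toy column (not our dataset): each category becomes its own True/False column.

```python
import pandas as pd

# a toy frame to show what one-hot encoding produces
toy = pd.DataFrame({"color": ["purple", "orange", "green"]})
encoded = pd.get_dummies(toy, columns=["color"])
print(encoded.columns.tolist())  # → ['color_green', 'color_orange', 'color_purple']
```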

Convert cap_color using pd.get_dummies. Refer to the demo notebook for an example.

In [37]:
# TODO
df = pd.get_dummies(data=df, columns=['cap_color'])

Our dataframe should now look like this. Use this (below) to check your work.

In [39]:
df.head()
Out[39]:
cap_diameter stem_width has_skull poisonous cap_color_green cap_color_orange cap_color_purple
0 15 2 True True False False True
1 25 5 True True False True False
2 3 6 False False True False False
3 8 3 False False True False False
4 55 35 False True True False False

Set X (features) and y (target)¶

TODO: Set our X and y values

In [41]:
# TODO. Determine what is X (our features) and what is y (our target)
X = df.drop('poisonous', axis=1)
y = df['poisonous']

Your X and y should look like the below. Use this to check yourself before moving onward.

In [43]:
X.shape, y.shape
Out[43]:
((9, 6), (9,))
In [44]:
X
Out[44]:
cap_diameter stem_width has_skull cap_color_green cap_color_orange cap_color_purple
0 15 2 True False False True
1 25 5 True False True False
2 3 6 False True False False
3 8 3 False True False False
4 55 35 False True False False
5 7 6 False False False True
6 3 8 True False False True
7 35 4 False True False False
8 23 5 False True False False
In [45]:
y
Out[45]:
0     True
1     True
2    False
3    False
4     True
5     True
6     True
7     True
8    False
Name: poisonous, dtype: bool

Split into train and test¶

First, let's import what we need from scikit learn.

In [47]:
from sklearn.model_selection import train_test_split

TODO: Now write the line of code to get our X_train, X_test, y_train and y_test using only 20% of our data as the test size.

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)  # note: without random_state, the split changes each run

The shapes should look like the below. Use this to check yourself. Also note that our dataset is currently very small to begin with. The test subset will be even smaller.

In [51]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape
Out[51]:
((7, 6), (2, 6), (7,), (2,))
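If you want the split to be reproducible and to preserve class proportions, `train_test_split` also accepts `random_state` and `stratify`. A sketch on toy arrays (not our mushrooms):

```python
import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(20).reshape(10, 2)
y = np.array([0] * 5 + [1] * 5)

# stratify keeps class proportions in both subsets; random_state makes the split repeatable
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print(X_te.shape, sorted(y_te.tolist()))  # → (2, 2) [0, 1]
```

With a dataset as tiny and imbalanced as ours, stratifying matters: an unlucky random split could put every edible mushroom in one subset.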

Modeling!¶

For the sake of this exercise, we will be working with decision trees. We will learn more about how they work next week.

Again, we start by importing what we need from scikit learn.

In [53]:
from sklearn import tree

TODO: Create our decision tree classifier and fit it using our training dataset.

In [55]:
# TODO
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
In [56]:
# TODO
dtc.fit(X_train, y_train)
Out[56]:
DecisionTreeClassifier()
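`DecisionTreeClassifier` also takes hyperparameters that control the tree's shape; `max_depth` is the most common one. A minimal sketch on toy data (not our mushrooms), assuming default settings otherwise:

```python
import numpy as np
from sklearn.tree import DecisionTreeClassifier

X = np.array([[0], [1], [2], [3], [4], [5]])
y = np.array([0, 0, 0, 1, 1, 1])

# max_depth caps how many questions the tree may ask, which limits overfitting
clf = DecisionTreeClassifier(max_depth=2, random_state=0)
clf.fit(X, y)
print(clf.predict([[1.5], [4.5]]))  # → [0 1]
```

We will dig into what these knobs actually do when we cover decision trees next week.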

Evaluate!¶

TODO: Use our classifier now to predict the target values of our testing dataset.

In [59]:
# TODO
predicted = dtc.predict(X_test)
In [60]:
predicted
Out[60]:
array([ True, False])

We got 100% accuracy. Normally this would be incredibly suspicious... but given the tiny size of our dataset (the test set has only two mushrooms), this is OK (we will work with more data soon!).

TODO: Get the accuracy score.

In [62]:
# TODO
dtc.score(X_test, y_test)
Out[62]:
1.0
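`score` is just accuracy under the hood: the fraction of predictions that match the true labels. The same number can be computed from predictions with `metrics.accuracy_score` (toy arrays for illustration):

```python
import numpy as np
from sklearn.metrics import accuracy_score

y_true = np.array([True, False, True, True])
y_pred = np.array([True, False, False, True])

# accuracy = matching predictions / total predictions = 3/4 here
print(accuracy_score(y_true, y_pred))  # → 0.75
```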

Now let's visualize our tree so we can see what's going on! (Again, refer to the demo code for examples.)

In [64]:
import matplotlib.pyplot as plt

TODO: Create a visualization of our decision tree.

In [66]:
# class_names must follow dtc.classes_ (sorted) order; y.unique() lists classes in order of
# appearance, which can mislabel the nodes
tree.plot_tree(dtc, feature_names=X.columns, class_names=dtc.classes_.astype(str).tolist(), rounded=True, filled=True)
Out[66]:
[Text(0.4, 0.8333333333333334, 'cap_color_green <= 0.5\ngini = 0.408\nsamples = 7\nvalue = [2, 5]\nclass = True'),
 Text(0.2, 0.5, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = True'),
 Text(0.6, 0.5, 'cap_diameter <= 29.0\ngini = 0.5\nsamples = 4\nvalue = [2, 2]\nclass = False'),
 Text(0.4, 0.16666666666666666, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = False'),
 Text(0.8, 0.16666666666666666, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = True')]
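If the drawn tree is hard to read, `sklearn.tree.export_text` prints a text version of the same splits. A standalone sketch on toy data (not our fitted tree):

```python
import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_text

X = np.array([[0], [1], [2], [3]])
y = np.array([0, 0, 1, 1])

clf = DecisionTreeClassifier(random_state=0).fit(X, y)
# each "|---" line is one split or leaf, indented by depth
print(export_text(clf, feature_names=["x"]))
```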

Let's say we had a new mushroom...¶

new_mushroom = {"cap_diameter": 18, "cap_color": "orange", "stem_width": 7, "has_skull":True}
TODO: What would this new mushroom be classified as? Use the tree above to answer.

Poisonous (True). Following the tree from the root: the new mushroom's cap is not green, so it falls into the branch where every training sample was poisonous.

What was the most important feature according to this decision tree? What was the next most important feature?¶

TODO: Answer the question above and make a visualization. Refer to the demo!

In [70]:
# TODO
fi = dtc.feature_importances_  # feature importance array
fi = pd.Series(data=fi, index=X.columns)  # convert to a pandas Series for plotting
fi = fi.sort_values(ascending=False)  # sort descending
plt.figure(figsize=(12, 6))
# passing hue with legend=False avoids seaborn's deprecation warning for palette without hue
sns.barplot(x=fi.values, y=fi.index, hue=fi.index, palette="BuGn_r", legend=False)
plt.show()

Part 2¶

Now let's work with a larger dataset!

Load the data¶

TODO: Read in your mushroom.csv

In [75]:
df = pd.read_csv("./data/mushrooms.csv")

It should look something like the following:

In [77]:
df.head()
Out[77]:
class cap-shape cap-surface cap-color bruises odor gill-attachment gill-spacing gill-size gill-color ... stalk-surface-below-ring stalk-color-above-ring stalk-color-below-ring veil-type veil-color ring-number ring-type spore-print-color population habitat
0 p x s n t p f c n k ... s w w p w o p k s u
1 e x s y t a f c b k ... s w w p w o p n n g
2 e b s w t l f c b n ... s w w p w o p n n m
3 p x y w t p f c n n ... s w w p w o p k s u
4 e x s g f n f w b k ... s w w p w o e n a g

5 rows × 23 columns

In [78]:
df.columns
Out[78]:
Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

Data understanding and pre-processing¶

As just a quick check, we have no nulls to deal with:

In [81]:
df.isna().sum()
Out[81]:
class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

Moving on~

Is our data balanced?¶

TODO: Let's make the same countplot as a quick check! Our target value is now called class instead of poisonous.

In [84]:
sns.countplot(data=df, x="class")
Out[84]:
<Axes: xlabel='class', ylabel='count'>
In [85]:
df['class'].value_counts()
Out[85]:
class
e    4208
p    3916
Name: count, dtype: int64

This looks rather balanced!

Explore the data! Which features look relevant?¶

TODO: Same as before! Go through and explore the data. Answer: which features look relevant and why (point to any visualizations you make)?

In [88]:
sns.countplot(x = 'cap-color', data=df, hue = 'class')
Out[88]:
<Axes: xlabel='cap-color', ylabel='count'>
In [89]:
sns.countplot(x='cap-shape', data=df, hue='class')
Out[89]:
<Axes: xlabel='cap-shape', ylabel='count'>

Convert feature datatypes¶

Notice how all the columns are strings. We will need to convert these to integers now.

In [91]:
df.dtypes
Out[91]:
class                       object
cap-shape                   object
cap-surface                 object
cap-color                   object
bruises                     object
odor                        object
gill-attachment             object
gill-spacing                object
gill-size                   object
gill-color                  object
stalk-shape                 object
stalk-root                  object
stalk-surface-above-ring    object
stalk-surface-below-ring    object
stalk-color-above-ring      object
stalk-color-below-ring      object
veil-type                   object
veil-color                  object
ring-number                 object
ring-type                   object
spore-print-color           object
population                  object
habitat                     object
dtype: object

TODO: Let's use pd.get_dummies() again. You will need to select all the columns except class (our target value).

In [93]:
# TODO
columns = df.columns.drop('class')
df = pd.get_dummies(data=df, columns=columns)

It should look like this once you are done (below). We have 118 columns now (117 dummy features plus class)!

In [95]:
df
Out[95]:
class cap-shape_b cap-shape_c cap-shape_f cap-shape_k cap-shape_s cap-shape_x cap-surface_f cap-surface_g cap-surface_s ... population_s population_v population_y habitat_d habitat_g habitat_l habitat_m habitat_p habitat_u habitat_w
0 p False False False False False True False False True ... True False False False False False False False True False
1 e False False False False False True False False True ... False False False False True False False False False False
2 e True False False False False False False False True ... False False False False False False True False False False
3 p False False False False False True False False False ... True False False False False False False False True False
4 e False False False False False True False False True ... False False False False True False False False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8119 e False False False True False False False False True ... False False False False False True False False False False
8120 e False False False False False True False False True ... False True False False False True False False False False
8121 e False False True False False False False False True ... False False False False False True False False False False
8122 p False False False True False False False False False ... False True False False False True False False False False
8123 e False False False False False True False False True ... False False False False False True False False False False

8124 rows × 118 columns

Get our X and y¶

TODO: Now set our X (all the features) and y (just the class).

In [97]:
# TODO
X = df.drop('class', axis=1)
y = df['class']

Here is what the X and y should look like.

In [99]:
X
Out[99]:
cap-shape_b cap-shape_c cap-shape_f cap-shape_k cap-shape_s cap-shape_x cap-surface_f cap-surface_g cap-surface_s cap-surface_y ... population_s population_v population_y habitat_d habitat_g habitat_l habitat_m habitat_p habitat_u habitat_w
0 False False False False False True False False True False ... True False False False False False False False True False
1 False False False False False True False False True False ... False False False False True False False False False False
2 True False False False False False False False True False ... False False False False False False True False False False
3 False False False False False True False False False True ... True False False False False False False False True False
4 False False False False False True False False True False ... False False False False True False False False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8119 False False False True False False False False True False ... False False False False False True False False False False
8120 False False False False False True False False True False ... False True False False False True False False False False
8121 False False True False False False False False True False ... False False False False False True False False False False
8122 False False False True False False False False False True ... False True False False False True False False False False
8123 False False False False False True False False True False ... False False False False False True False False False False

8124 rows × 117 columns

In [100]:
y
Out[100]:
0       p
1       e
2       e
3       p
4       e
       ..
8119    e
8120    e
8121    e
8122    p
8123    e
Name: class, Length: 8124, dtype: object

Split into training and testing sets¶

TODO: Split into X_train, X_test, y_train, and y_test using 50% this time for the test size.

In [102]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

The shapes should look like this:

In [104]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape
Out[104]:
((4062, 117), (4062,), (4062, 117), (4062,))

Modeling!¶

TODO: Now let's train our model!

In [106]:
dtc.fit(X_train, y_train)
Out[106]:
DecisionTreeClassifier()

Evaluating¶

TODO: Now let's get the accuracy. Use the score method to do so.

In [108]:
dtc.score(X_test, y_test)
Out[108]:
1.0

TODO: Plot that decision tree!

In [110]:
# class_names must follow dtc.classes_ (sorted) order; y.unique() lists classes in order of
# appearance, which can mislabel the nodes
tree.plot_tree(dtc, feature_names=X.columns, class_names=dtc.classes_.tolist(), rounded=True, filled=True)
Out[110]:
[Text(0.5882352941176471, 0.9375, 'odor_n <= 0.5\ngini = 0.5\nsamples = 4062\nvalue = [2065, 1997]\nclass = e'),
 Text(0.35294117647058826, 0.8125, 'stalk-root_c <= 0.5\ngini = 0.276\nsamples = 2320\nvalue = [383, 1937]\nclass = p'),
 Text(0.23529411764705882, 0.6875, 'stalk-root_r <= 0.5\ngini = 0.12\nsamples = 2051\nvalue = [132, 1919]\nclass = p'),
 Text(0.17647058823529413, 0.5625, 'odor_a <= 0.5\ngini = 0.043\nsamples = 1962\nvalue = [43, 1919]\nclass = p'),
 Text(0.11764705882352941, 0.4375, 'odor_l <= 0.5\ngini = 0.021\nsamples = 1940\nvalue = [21, 1919]\nclass = p'),
 Text(0.058823529411764705, 0.3125, 'gini = 0.0\nsamples = 1919\nvalue = [0, 1919]\nclass = p'),
 Text(0.17647058823529413, 0.3125, 'gini = 0.0\nsamples = 21\nvalue = [21, 0]\nclass = e'),
 Text(0.23529411764705882, 0.4375, 'gini = 0.0\nsamples = 22\nvalue = [22, 0]\nclass = e'),
 Text(0.29411764705882354, 0.5625, 'gini = 0.0\nsamples = 89\nvalue = [89, 0]\nclass = e'),
 Text(0.47058823529411764, 0.6875, 'stalk-surface-above-ring_s <= 0.5\ngini = 0.125\nsamples = 269\nvalue = [251, 18]\nclass = e'),
 Text(0.4117647058823529, 0.5625, 'gini = 0.0\nsamples = 18\nvalue = [0, 18]\nclass = p'),
 Text(0.5294117647058824, 0.5625, 'gini = 0.0\nsamples = 251\nvalue = [251, 0]\nclass = e'),
 Text(0.8235294117647058, 0.8125, 'spore-print-color_r <= 0.5\ngini = 0.067\nsamples = 1742\nvalue = [1682.0, 60.0]\nclass = e'),
 Text(0.7647058823529411, 0.6875, 'stalk-surface-below-ring_y <= 0.5\ngini = 0.029\nsamples = 1707\nvalue = [1682, 25]\nclass = e'),
 Text(0.6470588235294118, 0.5625, 'cap-surface_g <= 0.5\ngini = 0.007\nsamples = 1681\nvalue = [1675, 6]\nclass = e'),
 Text(0.5882352941176471, 0.4375, 'cap-shape_c <= 0.5\ngini = 0.002\nsamples = 1677\nvalue = [1675.0, 2.0]\nclass = e'),
 Text(0.5294117647058824, 0.3125, 'gill-size_b <= 0.5\ngini = 0.001\nsamples = 1676\nvalue = [1675, 1]\nclass = e'),
 Text(0.47058823529411764, 0.1875, 'bruises_f <= 0.5\ngini = 0.021\nsamples = 94\nvalue = [93, 1]\nclass = e'),
 Text(0.4117647058823529, 0.0625, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = p'),
 Text(0.5294117647058824, 0.0625, 'gini = 0.0\nsamples = 93\nvalue = [93, 0]\nclass = e'),
 Text(0.5882352941176471, 0.1875, 'gini = 0.0\nsamples = 1582\nvalue = [1582, 0]\nclass = e'),
 Text(0.6470588235294118, 0.3125, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = p'),
 Text(0.7058823529411765, 0.4375, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = p'),
 Text(0.8823529411764706, 0.5625, 'gill-size_n <= 0.5\ngini = 0.393\nsamples = 26\nvalue = [7, 19]\nclass = p'),
 Text(0.8235294117647058, 0.4375, 'gini = 0.0\nsamples = 7\nvalue = [7, 0]\nclass = e'),
 Text(0.9411764705882353, 0.4375, 'gini = 0.0\nsamples = 19\nvalue = [0, 19]\nclass = p'),
 Text(0.8823529411764706, 0.6875, 'gini = 0.0\nsamples = 35\nvalue = [0, 35]\nclass = p')]

TODO: Get and display the feature importances. Note: You will probably want to make the figure size longer/taller so it is easier to read. Discuss this. Which ones are most important?

In [112]:
fi = dtc.feature_importances_  # feature importance array
fi = pd.Series(data=fi, index=X.columns)  # convert to a pandas Series for plotting
fi = fi.sort_values(ascending=False)  # sort descending
plt.figure(figsize=(12, 24))
# passing hue with legend=False avoids seaborn's deprecation warning for palette without hue
sns.barplot(x=fi.values, y=fi.index, hue=fi.index, palette="BuGn_r", legend=False)
plt.show()

The most important ones are odor_n, bruises_f, odor_p, spore-print-color_h, and spore-print-color_r. This matches the tree above: odor_n is the root split, so it carries the most information.

TODO: The basic accuracy (using score above) is a good place to start for evaluation. However, using the classification report gives us much more detail and a better overview of the performance.

In [115]:
from sklearn import metrics
In [116]:
predicted = dtc.predict(X_test)
In [117]:
# TODO
# print() renders the report's newlines instead of showing the raw string
print(metrics.classification_report(y_test, predicted))
              precision    recall  f1-score   support

           e       1.00      1.00      1.00      2143
           p       1.00      1.00      1.00      1919

    accuracy                           1.00      4062
   macro avg       1.00      1.00      1.00      4062
weighted avg       1.00      1.00      1.00      4062

We see that everything is quite high. This is likely because of the nature of our dataset: features like odor separate the two classes almost perfectly.
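For reference, the precision and recall columns in the report come straight from confusion-matrix counts: precision = TP/(TP+FP) and recall = TP/(TP+FN). A quick check on toy labels:

```python
import numpy as np
from sklearn.metrics import precision_score, recall_score

y_true = np.array([1, 1, 1, 0, 0])
y_pred = np.array([1, 1, 0, 0, 1])

# TP = 2, FP = 1, FN = 1  ->  precision = 2/3, recall = 2/3
print(round(precision_score(y_true, y_pred), 3), round(recall_score(y_true, y_pred), 3))
```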

Now let's look at the confusion matrix too!

In [120]:
from sklearn.metrics import classification_report, confusion_matrix
In [121]:
confusion_matrix(y_test, predicted)
Out[121]:
array([[2143,    0],
       [   0, 1919]])

In the above confusion matrix, the top left corner shows the mushrooms that were edible and correctly classified as edible. The bottom right corner shows mushrooms that were poisonous and correctly classified as poisonous. The bottom left shows those that were predicted as edible but were actually poisonous. And finally, the top right shows those that were predicted as poisonous but were actually edible.
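That layout can be pinned down explicitly with the `labels` argument, which fixes the row/column order. A standalone sketch on toy labels 'e' and 'p':

```python
import numpy as np
from sklearn.metrics import confusion_matrix

y_true = np.array(["e", "e", "p", "p", "p"])
y_pred = np.array(["e", "p", "p", "p", "e"])

# rows are true classes, columns are predicted classes, in the order given by labels
cm = confusion_matrix(y_true, y_pred, labels=["e", "p"])
print(cm.tolist())  # → [[1, 1], [1, 2]]
```

Here cm[0][1] = 1 is the one edible mushroom predicted poisonous, and cm[1][0] = 1 is the one poisonous mushroom predicted edible.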

See the image below for a brief visual of a confusion matrix. In our case, positive = edible and negative = poisonous.

(Image: diagram of a confusion matrix showing true/false positives and negatives.)

TODO: Answer the following. In this situation, which one is more dangerous: false positives (incorrectly predicted as edible) or false negatives (incorrectly predicted as poisonous). Explain why.

In this case, false positives are more dangerous: a mushroom that is actually poisonous gets mistakenly labeled as safe, and could end up being eaten. A false negative is far less costly, because a mushroom labeled poisonous simply will not get eaten, even if it is actually safe.