!pip install numpy pandas matplotlib seaborn

Requirement already satisfied: numpy in /opt/anaconda3/lib/python3.12/site-packages (1.26.4)
Requirement already satisfied: pandas in /opt/anaconda3/lib/python3.12/site-packages (2.2.2)
Requirement already satisfied: matplotlib in /opt/anaconda3/lib/python3.12/site-packages (3.8.4)
Requirement already satisfied: seaborn in /opt/anaconda3/lib/python3.12/site-packages (0.13.2)
Requirement already satisfied: python-dateutil>=2.8.2 in /opt/anaconda3/lib/python3.12/site-packages (from pandas) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in /opt/anaconda3/lib/python3.12/site-packages (from pandas) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in /opt/anaconda3/lib/python3.12/site-packages (from pandas) (2023.3)
Requirement already satisfied: contourpy>=1.0.1 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib) (1.2.0)
Requirement already satisfied: cycler>=0.10 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib) (4.51.0)
Requirement already satisfied: kiwisolver>=1.3.1 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib) (1.4.4)
Requirement already satisfied: packaging>=20.0 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib) (23.2)
Requirement already satisfied: pillow>=8 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib) (10.3.0)
Requirement already satisfied: pyparsing>=2.3.1 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib) (3.0.9)
Requirement already satisfied: six>=1.5 in /opt/anaconda3/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)

import numpy as np

[1,2,3]

[1, 2, 3]

# A one-dimensional array built from a plain Python list.
values = [1, 2, 3]
arr1 = np.array(values)
arr1

array([1, 2, 3])

arr1.shape

(3,)

# A two-dimensional array: each inner list becomes one row.
rows = [[1, 2, 3], [4, 5, 6]]
arr2 = np.array(rows)
arr2

array([[1, 2, 3],
       [4, 5, 6]])

#TODO: Write a line of code below to output the shape of the arr2 array. Expected output is (2,3)
arr2.shape

(2, 3)

#TODO: Write some code to create an np.array that has the shape (3,3).
# Generate the integers 1..9 and fold them into three rows of three.
arr3 = np.arange(1, 10).reshape(3, 3)
arr3.shape

(3, 3)

# Finding the number of dimensions
arr1.ndim

1

arr2.ndim

2

len(arr1.shape)

1

len(arr2.shape)

2

# Finding what kind of number is stored
arr1.dtype

dtype('int64')

np.array([1.4, 1])

array([1.4, 1. ])

import pandas as pd

# A Series is a one-dimensional labelled column; here it holds animal names.
animal_names = ['dog', 'cat', 'frog', 'bee']
animals = pd.Series(animal_names)
animals

0     dog
1     cat
2    frog
3     bee
dtype: object

# A second Series holding one color per position.
color_names = ['brown', 'black', 'green', 'yellow']
colors = pd.Series(color_names)
colors

0     brown
1     black
2     green
3    yellow
dtype: object

# Combine the two Series into a DataFrame; each dict key becomes a column name.
animal_columns = {"animal": animals, "color": colors}
animal_df = pd.DataFrame(data=animal_columns)
animal_df

#TODO: Create your own dataframe! It can be anything, as long as it has at least 2 columns and 3 rows.
# Built row by row: each tuple is one row, and `columns` names the fields.
shapes_df = pd.DataFrame(
    [
        ('square', 'hexagon', 'sphere'),
        ('rectangle', 'octagon', 'cube'),
        ('rhombus', 'decagon', 'cone'),
    ],
    columns=["parallelograms", "gons", "3d_shapes"],
)
shapes_df

honey_df = pd.read_csv("data/honeyproduction.csv")

honey_df.head()

# get the data type for each column.
honey_df.dtypes

state           object
numcol         float64
yieldpercol      int64
totalprod      float64
stocks         float64
priceperlb     float64
prodvalue      float64
year             int64
dtype: object

# get a list-like object (Index) of the columns. This is an iterable object.
honey_df.columns

Index(['state', 'numcol', 'yieldpercol', 'totalprod', 'stocks', 'priceperlb',
       'prodvalue', 'year'],
      dtype='object')

#TODO get shape of honey_df. Expected output: (626, 8)
honey_df.shape

(626, 8)

honey_df.describe()

# get the column with state information
honey_df['state']

0      AL
1      AZ
2      AR
3      CA
4      CO
       ..
621    VA
622    WA
623    WV
624    WI
625    WY
Name: state, Length: 626, dtype: object

# get the columns with state, priceperlb, and year information
honey_df[['state', 'priceperlb', 'year']]

# let's take a look at our DataFrame for reference.
honey_df.head()

# if we want to LOCate the row with the label 0:
honey_df.loc[0]

state                 AL
numcol           16000.0
yieldpercol           71
totalprod      1136000.0
stocks          159000.0
priceperlb          0.72
prodvalue       818000.0
year                1998
Name: 0, dtype: object

# if we want to locate the row based on position (not necessarily the labeled index)
honey_df.iloc[2]

state                 AR
numcol           53000.0
yieldpercol           65
totalprod      3445000.0
stocks         1688000.0
priceperlb          0.59
prodvalue      2033000.0
year                1998
Name: 2, dtype: object

# we can do ranges as well. they work like normal python slicing syntax
honey_df.iloc[1:4]

# just to prove the point, we can even get the first few even rows:
honey_df.iloc[0:11:2]

# filter the honey_df DataFrame so that we have all rows from NC
honey_df[honey_df['state'] == 'NC']

honey_df['state']

0      AL
1      AZ
2      AR
3      CA
4      CO
       ..
621    VA
622    WA
623    WV
624    WI
625    WY
Name: state, Length: 626, dtype: object

honey_df['state'] == 'NC'

0      False
1      False
2      False
3      False
4      False
       ...  
621    False
622    False
623    False
624    False
625    False
Name: state, Length: 626, dtype: bool

honey_df[honey_df['state'] == 'NC']

#TODO write your own conditional to filter a subset of the DataFrame.
#It can be anything EXCEPT filtering by state. Hint: Try year!
honey_df[honey_df['year'] == 2004]

honey_df = pd.read_csv('data/honeyproduction_withnulls.csv')

honey_df.head()

honey_df.isna().sum()

state          0
numcol         2
yieldpercol    2
totalprod      5
stocks         5
priceperlb     4
prodvalue      3
year           0
dtype: int64

# Replace the missing totalprod values with the column's mean.
totalprod_mean = honey_df['totalprod'].mean()
honey_df['totalprod'] = honey_df['totalprod'].fillna(totalprod_mean)

honey_df.isna().sum()

state          0
numcol         2
yieldpercol    2
totalprod      0
stocks         5
priceperlb     4
prodvalue      3
year           0
dtype: int64

#TODO do the same for priceperlb
# Replace the missing priceperlb values with the column's mean, then recount the nulls.
priceperlb_mean = honey_df['priceperlb'].mean()
honey_df['priceperlb'] = honey_df['priceperlb'].fillna(priceperlb_mean)
honey_df.isna().sum()

state          0
numcol         2
yieldpercol    2
totalprod      0
stocks         5
priceperlb     0
prodvalue      3
year           0
dtype: int64

len(honey_df)

626

# Drop every row that still contains at least one missing value.
honey_df = honey_df.dropna(axis='index')

len(honey_df)

615

# New column: price per ounce, i.e. the per-pound price split across 16 ounces.
honey_df['priceperoz'] = honey_df['priceperlb'].div(16)

honey_df.head()

# Draw a random sample holding a 0.23 fraction of the rows. No random_state is
# passed, so which rows are picked can differ from run to run.
sampled_df = honey_df.sample(frac=0.23)

len(sampled_df)

141

len(honey_df)

615

# .apply() calls the given function on every value of the column and returns a
# new Series; here each per-ounce price is multiplied by 28.35.
# NOTE(review): 28.35 looks like grams per ounce; a per-gram price would need
# division rather than multiplication - confirm the intended conversion.
honey_df['priceperoz'].apply(lambda p: p * 28.35)

0      1.275750
1      1.134000
2      1.045406
3      1.098563
4      2.501876
         ...   
621    6.679969
622    4.217062
623    5.156156
624    3.632344
625    3.313406
Name: priceperoz, Length: 615, dtype: float64

# Store the per-gram price as a new column, computed with .apply().
# One ounce is about 28.35 grams, so the per-gram price is the per-ounce price
# DIVIDED by 28.35; multiplying instead would overstate it roughly 800-fold.
GRAMS_PER_OUNCE = 28.35
honey_df['pricepergram'] = honey_df['priceperoz'].apply(lambda p: p / GRAMS_PER_OUNCE)

honey_df.head()

import matplotlib.pyplot as plt

# an empty plot
plt.plot()

[]

# plotting a list
plt.plot([1, 2, 3, 4])

[<matplotlib.lines.Line2D at 0x165482a50>]

# Plot one list against another: x supplies the horizontal positions and
# y the matching heights.
x, y = [1, 2, 3, 4], [5, 19, 12, 12]
fig, ax = plt.subplots()
ax.plot(x, y)

[<matplotlib.lines.Line2D at 0x1654d35c0>]

# A more polished plot that is also written out as a PNG file.
# 1. data to plot
x = [1, 2, 3, 4]
y = [13, 53, 39, 2]

# 2. create the figure and a single axes with figsize=(20, 10)
fig, ax = plt.subplots(figsize=(20, 10))

# 3. draw the line
ax.plot(x, y)

# 4. title and axis labels
ax.set_title("simple plot")
ax.set_xlabel("x axis")
ax.set_ylabel("yaxis")

# 5. write the figure to disk
fig.savefig('sample-plot.png')

# it's very easy to use .plot() on a DataFrame.
honey_df.plot(x='year', y='stocks', kind='scatter')

<Axes: xlabel='year', ylabel='stocks'>

#TODO plot your own scatter plot on honey_df:
honey_df.plot(x='year', y='pricepergram', kind='scatter')

<Axes: xlabel='year', ylabel='pricepergram'>

import seaborn as sns

# sns.set() is only a legacy alias; set_theme() is the preferred interface.
# The rc override sets the figure size to width=20, height=12.
sns.set_theme(rc={"figure.figsize": (20, 12)})
# One line per state (hue) showing stocks over the years.
sns.lineplot(data=honey_df, x='year', y='stocks', hue='state')

<Axes: xlabel='year', ylabel='stocks'>

sns.pairplot(data=honey_df)

<seaborn.axisgrid.PairGrid at 0x174910560>

#TODO: Change this line plot to a scatter plot. (HINT: Change sns.lineplot)
# sns.set() is only a legacy alias; set_theme() is the preferred interface.
# The rc override sets the figure size to width=20, height=12.
sns.set_theme(rc={"figure.figsize": (20, 12)})
# One point per row, colored by state (hue), showing stocks against year.
sns.scatterplot(data=honey_df, x='year', y='stocks', hue='state')

<Axes: xlabel='year', ylabel='stocks'>

	numcol	yieldpercol	totalprod	stocks	priceperlb	prodvalue	year
count	626.000000	626.000000	6.260000e+02	6.260000e+02	626.000000	6.260000e+02	626.000000
mean	60284.345048	62.009585	4.169086e+06	1.318859e+06	1.409569	4.715741e+06	2004.864217
std	91077.087231	19.458754	6.883847e+06	2.272964e+06	0.638599	7.976110e+06	4.317306
min	2000.000000	19.000000	8.400000e+04	8.000000e+03	0.490000	1.620000e+05	1998.000000
25%	9000.000000	48.000000	4.750000e+05	1.430000e+05	0.932500	7.592500e+05	2001.000000
50%	26000.000000	60.000000	1.533000e+06	4.395000e+05	1.360000	1.841500e+06	2005.000000
75%	63750.000000	74.000000	4.175250e+06	1.489500e+06	1.680000	4.703250e+06	2009.000000
max	510000.000000	136.000000	4.641000e+07	1.380000e+07	4.150000	6.961500e+07	2012.000000

	state	numcol	yieldpercol	totalprod	stocks	priceperlb	prodvalue	year
27	NC	8000.0	59	472000.0	151000.0	1.38	651000.0	1998
70	NC	9000.0	46	414000.0	104000.0	1.62	671000.0	1999
113	NC	11000.0	49	539000.0	243000.0	1.43	771000.0	2000
156	NC	13000.0	44	572000.0	172000.0	1.48	847000.0	2001
200	NC	16000.0	42	672000.0	74000.0	1.41	948000.0	2002
244	NC	10000.0	44	440000.0	79000.0	1.92	845000.0	2003
287	NC	9000.0	40	360000.0	72000.0	1.93	695000.0	2004
328	NC	10000.0	54	540000.0	146000.0	1.88	1015000.0	2005
369	NC	10000.0	50	500000.0	215000.0	1.57	785000.0	2006
410	NC	12000.0	45	540000.0	76000.0	2.49	1345000.0	2007
451	NC	12000.0	52	624000.0	137000.0	2.18	1360000.0	2008
491	NC	11000.0	45	495000.0	84000.0	2.57	1272000.0	2009
531	NC	13000.0	46	598000.0	144000.0	2.66	1591000.0	2010
571	NC	14000.0	62	868000.0	95000.0	2.83	2456000.0	2011
611	NC	13000.0	39	507000.0	106000.0	3.76	1906000.0	2012

	state	numcol	yieldpercol	totalprod	stocks	priceperlb	prodvalue	year
27	NC	8000.0	59	472000.0	151000.0	1.38	651000.0	1998
70	NC	9000.0	46	414000.0	104000.0	1.62	671000.0	1999
113	NC	11000.0	49	539000.0	243000.0	1.43	771000.0	2000
156	NC	13000.0	44	572000.0	172000.0	1.48	847000.0	2001
200	NC	16000.0	42	672000.0	74000.0	1.41	948000.0	2002
244	NC	10000.0	44	440000.0	79000.0	1.92	845000.0	2003
287	NC	9000.0	40	360000.0	72000.0	1.93	695000.0	2004
328	NC	10000.0	54	540000.0	146000.0	1.88	1015000.0	2005
369	NC	10000.0	50	500000.0	215000.0	1.57	785000.0	2006
410	NC	12000.0	45	540000.0	76000.0	2.49	1345000.0	2007
451	NC	12000.0	52	624000.0	137000.0	2.18	1360000.0	2008
491	NC	11000.0	45	495000.0	84000.0	2.57	1272000.0	2009
531	NC	13000.0	46	598000.0	144000.0	2.66	1591000.0	2010
571	NC	14000.0	62	868000.0	95000.0	2.83	2456000.0	2011
611	NC	13000.0	39	507000.0	106000.0	3.76	1906000.0	2012

	state	numcol	yieldpercol	totalprod	stocks	priceperlb	prodvalue	year
261	AL	12000.0	87	1044000.0	282000.0	1.41	1472000.0	2004
262	AZ	32000.0	55	1760000.0	774000.0	1.11	1954000.0	2004
263	AR	40000.0	57	2280000.0	388000.0	0.87	1984000.0	2004
264	CA	390000.0	45	17550000.0	5792000.0	1.05	18428000.0	2004
265	CO	23000.0	80	1840000.0	791000.0	1.35	2484000.0	2004
266	FL	205000.0	98	20090000.0	2009000.0	1.02	20492000.0	2004
267	GA	63000.0	49	3087000.0	648000.0	1.20	3704000.0	2004
268	HI	8000.0	96	768000.0	77000.0	1.59	1221000.0	2004
269	ID	100000.0	63	6300000.0	2520000.0	1.02	6426000.0	2004
270	IL	7000.0	55	385000.0	193000.0	1.86	716000.0	2004
271	IN	7000.0	59	413000.0	145000.0	1.47	607000.0	2004
272	IA	35000.0	67	2345000.0	1337000.0	1.06	2486000.0	2004
273	KS	14000.0	80	1120000.0	683000.0	1.18	1322000.0	2004
274	KY	5000.0	56	280000.0	34000.0	1.96	549000.0	2004
275	LA	35000.0	98	3430000.0	240000.0	0.79	2710000.0	2004
276	ME	7000.0	31	217000.0	37000.0	1.28	278000.0	2004
277	MI	65000.0	67	4355000.0	2439000.0	1.16	5052000.0	2004
278	MN	135000.0	75	10125000.0	1924000.0	1.08	10935000.0	2004
279	MS	18000.0	65	1170000.0	421000.0	0.79	924000.0	2004
280	MO	16000.0	41	656000.0	151000.0	1.36	892000.0	2004
281	MT	140000.0	77	10780000.0	3773000.0	1.08	11642000.0	2004
282	NE	51000.0	89	4539000.0	2043000.0	1.01	4584000.0	2004
283	NV	14000.0	55	770000.0	316000.0	1.78	1371000.0	2004
284	NJ	12000.0	27	324000.0	45000.0	1.40	454000.0	2004
285	NM	8000.0	44	352000.0	127000.0	1.19	419000.0	2004
286	NY	64000.0	67	4288000.0	1887000.0	1.36	5832000.0	2004
287	NC	9000.0	40	360000.0	72000.0	1.93	695000.0	2004
288	ND	390000.0	78	30420000.0	9126000.0	1.05	31941000.0	2004
289	OH	16000.0	58	928000.0	353000.0	1.53	1420000.0	2004
290	OR	42000.0	54	2268000.0	1111000.0	1.21	2744000.0	2004
291	PA	30000.0	54	1620000.0	810000.0	1.42	2300000.0	2004
292	SD	215000.0	105	22575000.0	13545000.0	1.01	22801000.0	2004
293	TN	6000.0	54	324000.0	91000.0	1.73	561000.0	2004
294	TX	116000.0	76	8816000.0	1411000.0	0.97	8552000.0	2004
295	UT	24000.0	70	1680000.0	554000.0	1.10	1848000.0	2004
296	VT	6000.0	68	408000.0	192000.0	1.51	616000.0	2004
297	VA	7000.0	38	266000.0	69000.0	2.10	559000.0	2004
298	WA	56000.0	63	3528000.0	1376000.0	0.98	3457000.0	2004
299	WV	9000.0	55	495000.0	183000.0	1.41	698000.0	2004
300	WI	68000.0	86	5848000.0	2632000.0	1.19	6959000.0	2004
301	WY	39000.0	75	2925000.0	380000.0	1.10	3218000.0	2004

	state	numcol	yieldpercol	totalprod	stocks	priceperlb	prodvalue	year	priceperoz
0	AL	16000.0	71.0	1.136000e+06	159000.0	0.720000	818000.0	1998	0.045000
1	AZ	55000.0	60.0	3.300000e+06	1485000.0	0.640000	2112000.0	1998	0.040000
2	AR	53000.0	65.0	4.133610e+06	1688000.0	0.590000	2033000.0	1998	0.036875
3	CA	450000.0	83.0	3.735000e+07	12326000.0	0.620000	23157000.0	1998	0.038750
4	CO	27000.0	72.0	1.944000e+06	1594000.0	1.411994	1361000.0	1998	0.088250

Data Understanding and Visualization

with Pandas, NumPy, and Matplotlib

Philip Vishnevsky

Installation

NumPy

N-Dimensional Arrays

Utility properties

Pandas

Working with data!

DataFrame utility properties!

Slicing DataFrames

Filtering DataFrames with conditionals

Working with missing data/nulls!

Adding new columns

Sampling data

Applying functions to a column

Matplotlib

Working with DataFrames

Seaborn

	parallelograms	gons	3d_shapes
0	square	hexagon	sphere
1	rectangle	octagon	cube
2	rhombus	decagon	cone

	state	numcol	yieldpercol	totalprod	stocks	priceperlb	prodvalue	year
0	AL	16000.0	71	1136000.0	159000.0	0.72	818000.0	1998
1	AZ	55000.0	60	3300000.0	1485000.0	0.64	2112000.0	1998
2	AR	53000.0	65	3445000.0	1688000.0	0.59	2033000.0	1998
3	CA	450000.0	83	37350000.0	12326000.0	0.62	23157000.0	1998
4	CO	27000.0	72	1944000.0	1594000.0	0.70	1361000.0	1998