Project 2: Classification¶

Philip Vishnevsky
2025-02-27

1. Problem Introduction¶

1.1 Objectives¶

In this project, we aim to classify used cars into their respective brands using a comprehensive dataset of used cars from the US. The dataset includes a wide range of features such as physical dimensions (e.g., legroom, length, width, wheelbase), engine details (e.g., engine cylinders, engine displacement, horsepower), fuel attributes (e.g., fuel type, fuel tank volume, highway and city fuel economy), and market indicators like price, mileage, and days on market.

1.2 Questions to Answer¶

  • Brand Prediction: Can we accurately predict the car's brand based on its attributes?
  • Feature Influence: Which features are most important for distinguishing between different car brands?
  • Market Trends: How do key market attributes such as price and mileage vary across brands?
  • Attribute Signals: Do physical and engine specifications provide strong signals for brand classification?

2. Data Introduction¶

We are using the US Used Cars Dataset from Kaggle -- a comprehensive collection of data on used cars in the United States. The original dataset contains a broad set of attributes, with 3,000,000 rows and 66 columns; we have refined it for our brand classification task. After initial cleaning, we focus on a subset of features that best capture the characteristics of each vehicle.

The 27 features in our refined dataset are:

  • make_name: Car brand (our target variable).
  • back_legroom: Rear legroom measurement.
  • body_type: Vehicle body type (e.g., sedan, SUV, truck).
  • city: Originally a high-cardinality variable that we replaced with a frequency-encoded feature (city_freq) representing the count of vehicles from each city.
  • city_fuel_economy: Fuel economy in city driving.
  • daysonmarket: Number of days the vehicle has been listed.
  • engine_cylinders: Number of engine cylinders (e.g., 4, 6, or 8).
  • engine_displacement: Engine displacement.
  • franchise_dealer: Indicator if the vehicle was sold by a franchise dealer.
  • front_legroom: Front legroom measurement.
  • fuel_tank_volume: Volume of the fuel tank.
  • fuel_type: Type of fuel used (e.g., Petrol, Diesel).
  • height: Vehicle height.
  • highway_fuel_economy: Fuel economy on the highway.
  • horsepower: Engine horsepower.
  • is_new: Boolean indicator for whether the car is new.
  • length: Vehicle length.
  • maximum_seating: Seating capacity.
  • mileage: Total mileage.
  • owner_count: Number of previous owners.
  • price: Listed price of the car.
  • savings_amount: Discount or savings offered.
  • transmission: Type of transmission (e.g., Automatic, Manual).
  • wheel_system: Drive system (e.g., FWD, AWD).
  • wheelbase: Distance between the front and rear wheels.
  • width: Vehicle width.
  • year: Model year.
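To make the city_freq idea above concrete, here is a minimal sketch of frequency encoding on a toy series (synthetic city names, for illustration only):

```python
import pandas as pd

# Each city is replaced by the number of listings from that city,
# so one numeric column stands in for thousands of one-hot columns.
cities = pd.Series(["Houston", "Houston", "Austin", "Houston", "Austin", "Dallas"])
city_counts = cities.value_counts()
city_freq = cities.map(city_counts)
print(city_freq.tolist())  # [3, 3, 2, 3, 2, 1]
```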

3. Data Pre-Processing¶

Before modeling, the dataset must undergo several pre-processing steps:

  1. Removing Unneeded Features: We explicitly list columns to keep, ensuring irrelevant features are excluded from modeling. For example, we drop columns such as vin, bed, description, listing_id, latitude, longitude, exterior_color, interior_color, listed_date, listing_color, and many others that are not needed for brand classification.

  2. Removing Rare Brands: We remove car brands with fewer than 5,000 listings to focus on the most common brands. This improves class balance and model stability by reducing the noise introduced by makes with very few rows, and it simplifies the classification task from roughly 100 brands to around 30.

  3. Data Type Conversion: Some numeric values are stored as strings with units (e.g., "38.3 in" or "13.2 gal"). We extract the numeric part and convert these to floats. Boolean columns, such as franchise_dealer and is_new, are converted to integers.

  4. Imputing Missing Values: Missing numerical values (e.g., in dimensions and engine specs) are imputed with the column median, while categorical and cylinder-count values are filled with the mode. This step ensures no null values remain in the dataset.

  5. Categorical Encoding: Features such as body_type, fuel_type, transmission, and wheel_system are one-hot encoded. The city column, originally high in cardinality (over 4600 types), is transformed via frequency encoding to avoid a massive expansion of features.

After these steps, the dataset contains only the cleaned and relevant features, with each row representing a car and its attributes, and the target variable (make_name) indicating the car’s brand, which we aim to predict using the aforementioned features.
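The conversion, imputation, and encoding steps above can be sketched end-to-end on a toy frame (synthetic values; the actual pipeline in Section 3.3 operates on the full dataset):

```python
import numpy as np
import pandas as pd

# Three synthetic rows exercising steps 3-5 on a miniature scale
toy = pd.DataFrame({
    "back_legroom": ["35.1 in", "38.1 in", np.nan],
    "is_new": [True, False, True],
    "fuel_type": ["Gasoline", np.nan, "Diesel"],
})

# Step 3: strip units -- extract the numeric part and cast to float
toy["back_legroom"] = pd.to_numeric(
    toy["back_legroom"].astype(str).str.extract(r"([\d\.]+)", expand=False),
    errors="coerce",
)

# Step 4: impute -- median for numeric, mode for categorical
toy["back_legroom"] = toy["back_legroom"].fillna(toy["back_legroom"].median())
toy["fuel_type"] = toy["fuel_type"].fillna(toy["fuel_type"].mode()[0])

# Step 5: encode -- booleans to 0/1, categoricals one-hot
toy["is_new"] = toy["is_new"].astype(int)
toy = pd.get_dummies(toy, columns=["fuel_type"], drop_first=True)

print(toy)
assert toy.isnull().sum().sum() == 0  # no nulls remain
```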

3.0 Python Imports¶

In [13]:
# Import data libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Import Preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Import Models
from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import SVC

# Import Metrics
from sklearn.metrics import classification_report, confusion_matrix

# Suppress Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning)

Note: A support vector model was originally tested, but due to the size of the dataset and the number of features, SVC training times proved infeasible. An ensemble model such as random forest was therefore the more practical choice.
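If a support vector model were still desired, one common workaround (not used in this project) is to fit a linear SVM on a stratified subsample, keeping class proportions intact while holding the SVM's superlinear training cost in check. A minimal sketch on synthetic data (the sizes here are hypothetical stand-ins for the real feature matrix):

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# Synthetic multi-class data standing in for the car-brand problem
X, y = make_classification(n_samples=20000, n_features=20,
                           n_informative=10, n_classes=5, random_state=42)

# Stratified 10% subsample: class proportions are preserved
X_sub, _, y_sub, _ = train_test_split(X, y, train_size=0.1,
                                      stratify=y, random_state=42)

svm = LinearSVC(random_state=42).fit(X_sub, y_sub)
print(X_sub.shape)  # (2000, 20)
```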

3.1 Data Preview¶

In [16]:
# Load the dataset (note: uses significant memory)
path = "./data/used_cars_data.csv"
df = pd.read_csv(path, low_memory=False)
In [17]:
# We will define a function to display some detailed information about our dataframe

def show_info(dataframe):
    # Display first few rows
    print("\nFirst Few Rows\n")
    display(dataframe.head())
    
    # Basic stats
    print("\nBasic Numeric Stats\n")
    display(dataframe.describe())
    
    print("\nShape & Cols\n")
    print(dataframe.shape)
    print(dataframe.columns)
    
    # Show column info
    print("\nDetailed Column Info\n")
    dataframe.info(verbose=True, show_counts=True)
In [18]:
show_info(df)
First Few Rows

vin back_legroom bed bed_height bed_length body_type cabin city city_fuel_economy combine_fuel_economy ... transmission transmission_display trimId trim_name vehicle_damage_category wheel_system wheel_system_display wheelbase width year
0 ZACNJABB5KPJ92081 35.1 in NaN NaN NaN SUV / Crossover NaN Bayamon NaN NaN ... A 9-Speed Automatic Overdrive t83804 Latitude FWD NaN FWD Front-Wheel Drive 101.2 in 79.6 in 2019
1 SALCJ2FX1LH858117 38.1 in NaN NaN NaN SUV / Crossover NaN San Juan NaN NaN ... A 9-Speed Automatic Overdrive t86759 S AWD NaN AWD All-Wheel Drive 107.9 in 85.6 in 2020
2 JF1VA2M67G9829723 35.4 in NaN NaN NaN Sedan NaN Guaynabo 17.0 NaN ... M 6-Speed Manual t58994 Base NaN AWD All-Wheel Drive 104.3 in 78.9 in 2016
3 SALRR2RV0L2433391 37.6 in NaN NaN NaN SUV / Crossover NaN San Juan NaN NaN ... A 8-Speed Automatic Overdrive t86074 V6 HSE AWD NaN AWD All-Wheel Drive 115 in 87.4 in 2020
4 SALCJ2FXXLH862327 38.1 in NaN NaN NaN SUV / Crossover NaN San Juan NaN NaN ... A 9-Speed Automatic Overdrive t86759 S AWD NaN AWD All-Wheel Drive 107.9 in 85.6 in 2020

5 rows × 66 columns

Basic Numeric Stats

city_fuel_economy combine_fuel_economy daysonmarket engine_displacement highway_fuel_economy horsepower is_certified latitude listing_id longitude mileage owner_count price savings_amount seller_rating sp_id vehicle_damage_category year
count 2.508755e+06 0.0 3.000040e+06 2.827654e+06 2.508755e+06 2.827654e+06 0.0 3.000040e+06 3.000040e+06 3.000040e+06 2.855653e+06 1.483027e+06 3.000040e+06 3.000040e+06 2.959168e+06 2.999944e+06 0.0 3.000040e+06
mean 2.269325e+01 NaN 7.605973e+01 2.968488e+03 2.947337e+01 2.479957e+02 NaN 3.698414e+01 2.754987e+08 -9.064224e+01 3.114690e+04 1.532644e+00 2.993337e+04 5.509768e+02 4.270413e+00 2.335224e+05 NaN 2.017728e+03
std 8.807024e+00 NaN 1.088839e+02 1.348905e+03 7.769252e+00 9.046639e+01 NaN 4.996819e+00 8.894123e+06 1.390589e+01 7.458675e+04 9.202928e-01 1.956617e+04 1.079448e+03 5.133017e-01 1.323221e+05 NaN 4.178701e+00
min 7.000000e+00 NaN 0.000000e+00 7.000000e+02 1.000000e+01 5.500000e+01 NaN 1.834670e+01 1.994620e+07 -1.579280e+02 0.000000e+00 1.000000e+00 1.650000e+02 0.000000e+00 1.000000e+00 4.159300e+04 NaN 1.915000e+03
25% 1.800000e+01 NaN 1.400000e+01 2.000000e+03 2.500000e+01 1.750000e+02 NaN 3.350920e+01 2.745794e+08 -9.708820e+01 6.000000e+00 1.000000e+00 1.845100e+04 0.000000e+00 4.000000e+00 6.337500e+04 NaN 2.017000e+03
50% 2.100000e+01 NaN 3.500000e+01 2.500000e+03 2.900000e+01 2.440000e+02 NaN 3.784710e+01 2.785453e+08 -8.724950e+01 8.267000e+03 1.000000e+00 2.647700e+04 0.000000e+00 4.341463e+00 2.816270e+05 NaN 2.020000e+03
75% 2.600000e+01 NaN 8.200000e+01 3.600000e+03 3.300000e+01 3.000000e+02 NaN 4.100620e+01 2.804553e+08 -8.045490e+01 4.366200e+04 2.000000e+00 3.822000e+04 7.850000e+02 4.605263e+00 3.366140e+05 NaN 2.020000e+03
max 1.270000e+02 NaN 3.599000e+03 8.400000e+03 1.270000e+02 1.001000e+03 NaN 6.120310e+01 2.820222e+08 -6.607850e+01 9.999999e+07 1.900000e+01 3.299995e+06 1.474140e+05 5.000000e+00 4.409510e+05 NaN 2.021000e+03
Shape & Cols

(3000040, 66)
Index(['vin', 'back_legroom', 'bed', 'bed_height', 'bed_length', 'body_type',
       'cabin', 'city', 'city_fuel_economy', 'combine_fuel_economy',
       'daysonmarket', 'dealer_zip', 'description', 'engine_cylinders',
       'engine_displacement', 'engine_type', 'exterior_color', 'fleet',
       'frame_damaged', 'franchise_dealer', 'franchise_make', 'front_legroom',
       'fuel_tank_volume', 'fuel_type', 'has_accidents', 'height',
       'highway_fuel_economy', 'horsepower', 'interior_color', 'isCab',
       'is_certified', 'is_cpo', 'is_new', 'is_oemcpo', 'latitude', 'length',
       'listed_date', 'listing_color', 'listing_id', 'longitude',
       'main_picture_url', 'major_options', 'make_name', 'maximum_seating',
       'mileage', 'model_name', 'owner_count', 'power', 'price', 'salvage',
       'savings_amount', 'seller_rating', 'sp_id', 'sp_name', 'theft_title',
       'torque', 'transmission', 'transmission_display', 'trimId', 'trim_name',
       'vehicle_damage_category', 'wheel_system', 'wheel_system_display',
       'wheelbase', 'width', 'year'],
      dtype='object')

Detailed Column Info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000040 entries, 0 to 3000039
Data columns (total 66 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   vin                      3000040 non-null  object 
 1   back_legroom             2840771 non-null  object 
 2   bed                      19568 non-null    object 
 3   bed_height               429098 non-null   object 
 4   bed_length               429098 non-null   object 
 5   body_type                2986497 non-null  object 
 6   cabin                    63533 non-null    object 
 7   city                     3000040 non-null  object 
 8   city_fuel_economy        2508755 non-null  float64
 9   combine_fuel_economy     0 non-null        float64
 10  daysonmarket             3000040 non-null  int64  
 11  dealer_zip               3000040 non-null  object 
 12  description              2922139 non-null  object 
 13  engine_cylinders         2899459 non-null  object 
 14  engine_displacement      2827654 non-null  float64
 15  engine_type              2899459 non-null  object 
 16  exterior_color           2950085 non-null  object 
 17  fleet                    1573445 non-null  object 
 18  frame_damaged            1573445 non-null  object 
 19  franchise_dealer         3000040 non-null  bool   
 20  franchise_make           2427405 non-null  object 
 21  front_legroom            2840771 non-null  object 
 22  fuel_tank_volume         2840771 non-null  object 
 23  fuel_type                2917316 non-null  object 
 24  has_accidents            1573445 non-null  object 
 25  height                   2840771 non-null  object 
 26  highway_fuel_economy     2508755 non-null  float64
 27  horsepower               2827654 non-null  float64
 28  interior_color           2616054 non-null  object 
 29  isCab                    1573445 non-null  object 
 30  is_certified             0 non-null        float64
 31  is_cpo                   182898 non-null   object 
 32  is_new                   3000040 non-null  bool   
 33  is_oemcpo                135362 non-null   object 
 34  latitude                 3000040 non-null  float64
 35  length                   2840771 non-null  object 
 36  listed_date              3000040 non-null  object 
 37  listing_color            3000040 non-null  object 
 38  listing_id               3000040 non-null  int64  
 39  longitude                3000040 non-null  float64
 40  main_picture_url         2630947 non-null  object 
 41  major_options            2799992 non-null  object 
 42  make_name                3000040 non-null  object 
 43  maximum_seating          2840771 non-null  object 
 44  mileage                  2855653 non-null  float64
 45  model_name               3000040 non-null  object 
 46  owner_count              1483027 non-null  float64
 47  power                    2518614 non-null  object 
 48  price                    3000040 non-null  float64
 49  salvage                  1573445 non-null  object 
 50  savings_amount           3000040 non-null  int64  
 51  seller_rating            2959168 non-null  float64
 52  sp_id                    2999944 non-null  float64
 53  sp_name                  3000040 non-null  object 
 54  theft_title              1573445 non-null  object 
 55  torque                   2482247 non-null  object 
 56  transmission             2935855 non-null  object 
 57  transmission_display     2935855 non-null  object 
 58  trimId                   2884213 non-null  object 
 59  trim_name                2883746 non-null  object 
 60  vehicle_damage_category  0 non-null        float64
 61  wheel_system             2853308 non-null  object 
 62  wheel_system_display     2853308 non-null  object 
 63  wheelbase                2840771 non-null  object 
 64  width                    2840771 non-null  object 
 65  year                     3000040 non-null  int64  
dtypes: bool(2), float64(14), int64(4), object(46)
memory usage: 1.4+ GB

3.2 Trends¶

We observe the following trends from this preview of the dataset:

  • Many columns contain null/missing data
  • Many columns contain data irrelevant to brand classification
  • Certain features are stored as string objects when they should be floats (e.g., "35.1 in" in the legroom columns)

3.3 Pre-Processing Steps¶

We will address the issues observed in 3.2 by preprocessing the dataset: removing null values, enforcing consistent types, and encoding categorical variables. To start, we have carefully selected the most significant and relevant columns from the original dataset; all other features will be removed for the purposes of this project.

3.3.1 Column Removal¶

In [24]:
# We are only interested in keeping the below 27 columns
cols_to_keep = [
    "back_legroom", "body_type", "city", "city_fuel_economy", "daysonmarket",
    "engine_cylinders", "engine_displacement", "franchise_dealer",
    "front_legroom", "fuel_tank_volume", "fuel_type", "height",
    "highway_fuel_economy", "horsepower", "is_new", "length", "maximum_seating",
    "mileage", "owner_count", "price", "savings_amount", "transmission",
    "wheel_system", "wheelbase", "width", "year", "make_name"
]

df_dropped = df[cols_to_keep].copy()
In [25]:
# Call our earlier function
show_info(df_dropped)
First Few Rows

back_legroom body_type city city_fuel_economy daysonmarket engine_cylinders engine_displacement franchise_dealer front_legroom fuel_tank_volume ... mileage owner_count price savings_amount transmission wheel_system wheelbase width year make_name
0 35.1 in SUV / Crossover Bayamon NaN 522 I4 1300.0 True 41.2 in 12.7 gal ... 7.0 NaN 23141.0 0 A FWD 101.2 in 79.6 in 2019 Jeep
1 38.1 in SUV / Crossover San Juan NaN 207 I4 2000.0 True 39.1 in 17.7 gal ... 8.0 NaN 46500.0 0 A AWD 107.9 in 85.6 in 2020 Land Rover
2 35.4 in Sedan Guaynabo 17.0 1233 H4 2500.0 True 43.3 in 15.9 gal ... NaN 3.0 46995.0 0 M AWD 104.3 in 78.9 in 2016 Subaru
3 37.6 in SUV / Crossover San Juan NaN 196 V6 3000.0 True 39 in 23.5 gal ... 11.0 NaN 67430.0 0 A AWD 115 in 87.4 in 2020 Land Rover
4 38.1 in SUV / Crossover San Juan NaN 137 I4 2000.0 True 39.1 in 17.7 gal ... 7.0 NaN 48880.0 0 A AWD 107.9 in 85.6 in 2020 Land Rover

5 rows × 27 columns

Basic Numeric Stats

city_fuel_economy daysonmarket engine_displacement highway_fuel_economy horsepower mileage owner_count price savings_amount year
count 2.508755e+06 3.000040e+06 2.827654e+06 2.508755e+06 2.827654e+06 2.855653e+06 1.483027e+06 3.000040e+06 3.000040e+06 3.000040e+06
mean 2.269325e+01 7.605973e+01 2.968488e+03 2.947337e+01 2.479957e+02 3.114690e+04 1.532644e+00 2.993337e+04 5.509768e+02 2.017728e+03
std 8.807024e+00 1.088839e+02 1.348905e+03 7.769252e+00 9.046639e+01 7.458675e+04 9.202928e-01 1.956617e+04 1.079448e+03 4.178701e+00
min 7.000000e+00 0.000000e+00 7.000000e+02 1.000000e+01 5.500000e+01 0.000000e+00 1.000000e+00 1.650000e+02 0.000000e+00 1.915000e+03
25% 1.800000e+01 1.400000e+01 2.000000e+03 2.500000e+01 1.750000e+02 6.000000e+00 1.000000e+00 1.845100e+04 0.000000e+00 2.017000e+03
50% 2.100000e+01 3.500000e+01 2.500000e+03 2.900000e+01 2.440000e+02 8.267000e+03 1.000000e+00 2.647700e+04 0.000000e+00 2.020000e+03
75% 2.600000e+01 8.200000e+01 3.600000e+03 3.300000e+01 3.000000e+02 4.366200e+04 2.000000e+00 3.822000e+04 7.850000e+02 2.020000e+03
max 1.270000e+02 3.599000e+03 8.400000e+03 1.270000e+02 1.001000e+03 9.999999e+07 1.900000e+01 3.299995e+06 1.474140e+05 2.021000e+03
Shape & Cols

(3000040, 27)
Index(['back_legroom', 'body_type', 'city', 'city_fuel_economy',
       'daysonmarket', 'engine_cylinders', 'engine_displacement',
       'franchise_dealer', 'front_legroom', 'fuel_tank_volume', 'fuel_type',
       'height', 'highway_fuel_economy', 'horsepower', 'is_new', 'length',
       'maximum_seating', 'mileage', 'owner_count', 'price', 'savings_amount',
       'transmission', 'wheel_system', 'wheelbase', 'width', 'year',
       'make_name'],
      dtype='object')

Detailed Column Info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000040 entries, 0 to 3000039
Data columns (total 27 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   back_legroom          2840771 non-null  object 
 1   body_type             2986497 non-null  object 
 2   city                  3000040 non-null  object 
 3   city_fuel_economy     2508755 non-null  float64
 4   daysonmarket          3000040 non-null  int64  
 5   engine_cylinders      2899459 non-null  object 
 6   engine_displacement   2827654 non-null  float64
 7   franchise_dealer      3000040 non-null  bool   
 8   front_legroom         2840771 non-null  object 
 9   fuel_tank_volume      2840771 non-null  object 
 10  fuel_type             2917316 non-null  object 
 11  height                2840771 non-null  object 
 12  highway_fuel_economy  2508755 non-null  float64
 13  horsepower            2827654 non-null  float64
 14  is_new                3000040 non-null  bool   
 15  length                2840771 non-null  object 
 16  maximum_seating       2840771 non-null  object 
 17  mileage               2855653 non-null  float64
 18  owner_count           1483027 non-null  float64
 19  price                 3000040 non-null  float64
 20  savings_amount        3000040 non-null  int64  
 21  transmission          2935855 non-null  object 
 22  wheel_system          2853308 non-null  object 
 23  wheelbase             2840771 non-null  object 
 24  width                 2840771 non-null  object 
 25  year                  3000040 non-null  int64  
 26  make_name             3000040 non-null  object 
dtypes: bool(2), float64(7), int64(3), object(15)
memory usage: 577.9+ MB

3.3.2 Rare Brand Removal¶

In [27]:
# Drop brands with fewer than 5,000 rows
make_counts = df_dropped['make_name'].value_counts()
valid_makes = make_counts[make_counts >= 5000].index
df_dropped = df_dropped[df_dropped['make_name'].isin(valid_makes)].copy()

# Display shape and remaining brands
print("Shape After Brand Removal:", df_dropped.shape)
print("Remaining makes:", df_dropped['make_name'].unique())
Shape After Brand Removal: (2968284, 27)
Remaining makes: ['Jeep' 'Land Rover' 'Subaru' 'Mazda' 'BMW' 'Hyundai' 'Chevrolet' 'Lexus'
 'Cadillac' 'Chrysler' 'Dodge' 'Mercedes-Benz' 'Nissan' 'Honda' 'Kia'
 'Ford' 'Lincoln' 'Audi' 'Jaguar' 'Volkswagen' 'RAM' 'Porsche' 'Toyota'
 'INFINITI' 'GMC' 'Acura' 'Volvo' 'Mitsubishi' 'Buick' 'MINI']

3.3.3 Data Type Conversion¶

In [29]:
# engine_cylinders is stored as an engine configuration string (e.g., 'V6')
display(df_dropped['engine_cylinders'].unique())
# We need to extract cylinder counts
df_dropped['engine_cylinders'] = df_dropped['engine_cylinders'] \
    .str.extract(r'(\d+)', expand=False) \
    .astype(float)
# Results
display(df_dropped['engine_cylinders'].unique())
array(['I4', 'H4', 'V6', 'I6', 'V6 Diesel', nan, 'V8', 'V8 Biodiesel',
       'V8 Flex Fuel Vehicle', 'V6 Flex Fuel Vehicle', 'I4 Hybrid',
       'V6 Hybrid', 'I3', 'I5', 'I4 Flex Fuel Vehicle', 'I4 Diesel',
       'I6 Diesel', 'V8 Diesel', 'R2', 'V10', 'V12', 'V6 Biodiesel', 'H6',
       'I2', 'I5 Biodiesel', 'V8 Hybrid', 'H4 Hybrid',
       'V8 Compressed Natural Gas', 'W12', 'I6 Hybrid', 'I5 Diesel', 'W8',
       'I4 Compressed Natural Gas', 'V10 Diesel', 'V8 Propane',
       'I3 Hybrid', 'V6 Compressed Natural Gas'], dtype=object)
array([ 4.,  6., nan,  8.,  3.,  5.,  2., 10., 12.])
In [30]:
# Clean up other numerical features

measurement_cols = [
    'back_legroom', 'front_legroom', 'fuel_tank_volume',
    'height', 'length', 'maximum_seating', 'wheelbase', 'width'
]

# Convert these columns from strings with units to numeric floats
for col in measurement_cols:
    # Extract the first occurrence of a number (including decimals)
    df_dropped[col] = df_dropped[col].astype(str).str.extract(r'([\d\.]+)', expand=False)
    # Convert the extracted string to a float
    df_dropped[col] = pd.to_numeric(df_dropped[col], errors='coerce')
    # Impute missing values with the median of the column
    # (assign back rather than fillna(inplace=True) to avoid chained-assignment issues)
    df_dropped[col] = df_dropped[col].fillna(df_dropped[col].median())

3.3.4 Impute Missing Values¶

In [32]:
# Display only columns with null values, and count how many columns have them
missing_values = df_dropped.isnull().sum()
display(missing_values[missing_values > 0])
missing_value_count = missing_values[missing_values > 0].count()
print(f"Number of columns with missing values: {missing_value_count}")
body_type                 12843
city_fuel_economy        481848
engine_cylinders          96393
engine_displacement      164276
fuel_type                 80986
highway_fuel_economy     481848
horsepower               164276
mileage                  143406
owner_count             1508247
transmission              63484
wheel_system             141177
dtype: int64
Number of columns with missing values: 11
In [33]:
# Impute categorical columns with the mode
categorical_cols = [
    'body_type', 'fuel_type', 'transmission', 'wheel_system'
]

for col in categorical_cols:
    mode_val = df_dropped[col].mode()[0]
    df_dropped[col] = df_dropped[col].fillna(mode_val)
In [34]:
# Impute missing engine_cylinders with the mode
mode_value = df_dropped['engine_cylinders'].mode()[0]
df_dropped['engine_cylinders'] = df_dropped['engine_cylinders'].fillna(mode_value)
In [35]:
# Impute remaining numerical columns
num_cols = [
    'city_fuel_economy',   # float64
    'daysonmarket',        # int64
    'engine_displacement', # float64
    'highway_fuel_economy',# float64
    'horsepower',          # float64
    'mileage',             # float64
    'owner_count',         # float64
    'price',               # float64
    'savings_amount',      # int64
    'year'                 # int64
]

for col in num_cols:
    # For the 'year' column, we use the mode as it may have a dominant production year.
    if col == 'year':
        impute_value = df_dropped[col].mode()[0]
    else:
        impute_value = df_dropped[col].median()
    
    df_dropped[col] = df_dropped[col].fillna(impute_value)
In [36]:
# Display only columns with null values, and count how many columns have them
missing_values = df_dropped.isnull().sum()
display(missing_values[missing_values > 0])
missing_value_count = missing_values[missing_values > 0].count()
print(f"Number of columns with missing values: {missing_value_count}")
Series([], dtype: int64)
Number of columns with missing values: 0

3.3.5 Categorical Encoding¶

In [38]:
# Convert boolean columns to integers (0/1)
bool_cols = ['franchise_dealer', 'is_new']
for col in bool_cols:
    df_dropped[col] = df_dropped[col].astype(int)

# Encode the high-cardinality 'city' column using frequency encoding
city_counts = df_dropped['city'].value_counts()
df_dropped['city_freq'] = df_dropped['city'].map(city_counts)
df_dropped.drop('city', axis=1, inplace=True)

# Specify the categorical columns to encode
categorical_cols = ['body_type', 'fuel_type', 'transmission', 'wheel_system']

# One-hot encode these categorical features
# Use drop_first=True to avoid the dummy variable trap (multicollinearity)
df_encoded = pd.get_dummies(df_dropped, columns=categorical_cols, drop_first=True)

Now that we have completed all our preprocessing steps, let us view the prepared dataframe.

In [40]:
show_info(df_encoded)
First Few Rows

back_legroom city_fuel_economy daysonmarket engine_cylinders engine_displacement franchise_dealer front_legroom fuel_tank_volume height highway_fuel_economy ... fuel_type_Gasoline fuel_type_Hybrid fuel_type_Propane transmission_CVT transmission_Dual Clutch transmission_M wheel_system_4X2 wheel_system_AWD wheel_system_FWD wheel_system_RWD
0 35.1 21.0 522 4.0 1300.0 1 41.2 12.7 66.5 29.0 ... True False False False False False False False True False
1 38.1 21.0 207 4.0 2000.0 1 39.1 17.7 68.0 29.0 ... True False False False False False False True False False
2 35.4 17.0 1233 4.0 2500.0 1 43.3 15.9 58.1 23.0 ... True False False False False True False True False False
3 37.6 21.0 196 6.0 3000.0 1 39.0 23.5 73.0 29.0 ... True False False False False False False True False False
4 38.1 21.0 137 4.0 2000.0 1 39.1 17.7 68.0 29.0 ... True False False False False False False True False False

5 rows × 45 columns

Basic Numeric Stats

back_legroom city_fuel_economy daysonmarket engine_cylinders engine_displacement franchise_dealer front_legroom fuel_tank_volume height highway_fuel_economy ... length maximum_seating mileage owner_count price savings_amount wheelbase width year city_freq
count 2.968284e+06 2.968284e+06 2.968284e+06 2.968284e+06 2.968284e+06 2.968284e+06 2.968284e+06 2.968284e+06 2.968284e+06 2.968284e+06 ... 2.968284e+06 2.968284e+06 2.968284e+06 2.968284e+06 2.968284e+06 2.968284e+06 2.968284e+06 2.968284e+06 2.968284e+06 2.968284e+06
mean 3.800007e+01 2.239953e+01 7.571930e+01 5.086334e+00 2.941816e+03 8.133187e-01 4.219773e+01 1.857000e+01 6.593519e+01 2.937974e+01 ... 1.935921e+02 5.459802e+00 2.981266e+04 1.257736e+00 2.973176e+04 5.428629e+02 1.151673e+02 7.834571e+01 2.017781e+03 4.240457e+03
std 3.054381e+00 7.863222e+00 1.079200e+02 1.447159e+00 1.315140e+03 3.896555e-01 1.465058e+00 5.325202e+00 7.350079e+00 6.921976e+00 ... 1.970834e+01 1.150393e+00 7.306362e+04 6.889402e-01 1.771610e+04 9.973965e+02 1.418191e+01 7.490284e+00 4.052912e+00 6.407033e+03
min 0.000000e+00 8.000000e+00 0.000000e+00 2.000000e+00 7.000000e+02 0.000000e+00 0.000000e+00 1.900000e+00 4.370000e+01 1.000000e+01 ... 1.420000e+02 2.000000e+00 0.000000e+00 1.000000e+00 1.650000e+02 0.000000e+00 8.340000e+01 5.500000e+01 1.915000e+03 1.000000e+00
25% 3.620000e+01 1.800000e+01 1.400000e+01 4.000000e+00 2.000000e+03 1.000000e+00 4.110000e+01 1.480000e+01 5.830000e+01 2.500000e+01 ... 1.821000e+02 5.000000e+00 6.000000e+00 1.000000e+00 1.849500e+04 0.000000e+00 1.063000e+02 7.250000e+01 2.017000e+03 8.360000e+02
50% 3.820000e+01 2.100000e+01 3.500000e+01 4.000000e+00 2.500000e+03 1.000000e+00 4.200000e+01 1.770000e+01 6.610000e+01 2.900000e+01 ... 1.900000e+02 5.000000e+00 7.794000e+03 1.000000e+00 2.644800e+04 0.000000e+00 1.112000e+02 7.730000e+01 2.020000e+03 2.098000e+03
75% 3.970000e+01 2.500000e+01 8.200000e+01 6.000000e+00 3.500000e+03 1.000000e+00 4.300000e+01 2.100000e+01 7.000000e+01 3.200000e+01 ... 2.000000e+02 6.000000e+00 4.128300e+04 1.000000e+00 3.805000e+04 7.820000e+02 1.181000e+02 8.250000e+01 2.020000e+03 5.043000e+03
max 5.980000e+01 1.270000e+02 3.599000e+03 1.200000e+01 8.400000e+03 1.000000e+00 6.700000e+01 6.400000e+01 1.176000e+02 1.220000e+02 ... 2.941000e+02 1.500000e+01 9.999999e+07 1.900000e+01 2.698500e+06 1.300210e+05 2.045000e+02 1.090000e+02 2.021000e+03 4.317500e+04

8 rows × 22 columns

Shape & Cols

(2968284, 45)
Index(['back_legroom', 'city_fuel_economy', 'daysonmarket', 'engine_cylinders',
       'engine_displacement', 'franchise_dealer', 'front_legroom',
       'fuel_tank_volume', 'height', 'highway_fuel_economy', 'horsepower',
       'is_new', 'length', 'maximum_seating', 'mileage', 'owner_count',
       'price', 'savings_amount', 'wheelbase', 'width', 'year', 'make_name',
       'city_freq', 'body_type_Coupe', 'body_type_Hatchback',
       'body_type_Minivan', 'body_type_Pickup Truck',
       'body_type_SUV / Crossover', 'body_type_Sedan', 'body_type_Van',
       'body_type_Wagon', 'fuel_type_Compressed Natural Gas',
       'fuel_type_Diesel', 'fuel_type_Electric', 'fuel_type_Flex Fuel Vehicle',
       'fuel_type_Gasoline', 'fuel_type_Hybrid', 'fuel_type_Propane',
       'transmission_CVT', 'transmission_Dual Clutch', 'transmission_M',
       'wheel_system_4X2', 'wheel_system_AWD', 'wheel_system_FWD',
       'wheel_system_RWD'],
      dtype='object')

Detailed Column Info

<class 'pandas.core.frame.DataFrame'>
Index: 2968284 entries, 0 to 3000039
Data columns (total 45 columns):
 #   Column                            Non-Null Count    Dtype  
---  ------                            --------------    -----  
 0   back_legroom                      2968284 non-null  float64
 1   city_fuel_economy                 2968284 non-null  float64
 2   daysonmarket                      2968284 non-null  int64  
 3   engine_cylinders                  2968284 non-null  float64
 4   engine_displacement               2968284 non-null  float64
 5   franchise_dealer                  2968284 non-null  int32  
 6   front_legroom                     2968284 non-null  float64
 7   fuel_tank_volume                  2968284 non-null  float64
 8   height                            2968284 non-null  float64
 9   highway_fuel_economy              2968284 non-null  float64
 10  horsepower                        2968284 non-null  float64
 11  is_new                            2968284 non-null  int32  
 12  length                            2968284 non-null  float64
 13  maximum_seating                   2968284 non-null  float64
 14  mileage                           2968284 non-null  float64
 15  owner_count                       2968284 non-null  float64
 16  price                             2968284 non-null  float64
 17  savings_amount                    2968284 non-null  int64  
 18  wheelbase                         2968284 non-null  float64
 19  width                             2968284 non-null  float64
 20  year                              2968284 non-null  int64  
 21  make_name                         2968284 non-null  object 
 22  city_freq                         2968284 non-null  int64  
 23  body_type_Coupe                   2968284 non-null  bool   
 24  body_type_Hatchback               2968284 non-null  bool   
 25  body_type_Minivan                 2968284 non-null  bool   
 26  body_type_Pickup Truck            2968284 non-null  bool   
 27  body_type_SUV / Crossover         2968284 non-null  bool   
 28  body_type_Sedan                   2968284 non-null  bool   
 29  body_type_Van                     2968284 non-null  bool   
 30  body_type_Wagon                   2968284 non-null  bool   
 31  fuel_type_Compressed Natural Gas  2968284 non-null  bool   
 32  fuel_type_Diesel                  2968284 non-null  bool   
 33  fuel_type_Electric                2968284 non-null  bool   
 34  fuel_type_Flex Fuel Vehicle       2968284 non-null  bool   
 35  fuel_type_Gasoline                2968284 non-null  bool   
 36  fuel_type_Hybrid                  2968284 non-null  bool   
 37  fuel_type_Propane                 2968284 non-null  bool   
 38  transmission_CVT                  2968284 non-null  bool   
 39  transmission_Dual Clutch          2968284 non-null  bool   
 40  transmission_M                    2968284 non-null  bool   
 41  wheel_system_4X2                  2968284 non-null  bool   
 42  wheel_system_AWD                  2968284 non-null  bool   
 43  wheel_system_FWD                  2968284 non-null  bool   
 44  wheel_system_RWD                  2968284 non-null  bool   
dtypes: bool(22), float64(16), int32(2), int64(4), object(1)
memory usage: 583.1+ MB

We see now that df_encoded has no null values and no string-valued columns other than the target make_name. It also occupies less memory while retaining nearly all of the original 3 million rows!¶
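These properties can be asserted programmatically. Below is a minimal sketch on a hypothetical miniature frame standing in for df_encoded (the column names mirror the schema printed above, but the values are illustrative):

```python
import pandas as pd

# Hypothetical miniature stand-in for df_encoded (the real frame has
# ~3M rows and 45 columns); column names mirror the schema printed above.
df_small = pd.DataFrame({
    "price": [21000.0, 35000.0, 18000.0],
    "is_new": pd.Series([0, 1, 0], dtype="int32"),
    "body_type_Sedan": [True, False, True],
    "make_name": ["Ford", "BMW", "Kia"],
})

# Claim 1: no nulls anywhere.
assert df_small.isna().sum().sum() == 0

# Claim 2: every column except the target is numeric or boolean.
non_numeric = df_small.drop(columns="make_name").select_dtypes(
    exclude=["number", "bool"]).columns
assert len(non_numeric) == 0

# Downcast dtypes (int32, bool) keep the memory footprint small.
print(df_small.memory_usage(deep=True).sum(), "bytes")
```

Running checks like these before modeling catches regressions if the cleaning pipeline is ever re-run with different parameters.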

4. Data Understanding/Visualization¶

Now that we have the data appropriately cleaned, imputed, and encoded, we are able to perform some visualizations.

4.1 Frequency Distribution of Car Brands¶

In [45]:
# Order the car brands by frequency (from most common to least)
plt.figure(figsize=(12, 8))
order = df_dropped['make_name'].value_counts().index
sns.countplot(data=df_dropped, y='make_name', hue='make_name', order=order, palette='viridis', legend=False)
plt.title("Frequency Distribution of Car Brands")
plt.xlabel("Count")
plt.ylabel("Car Brand")
plt.show()

4.2 Correlation Heatmap of Continuous Features¶

In [47]:
# Select a subset of continuous variables
numeric_cols = ['price', 'mileage', 'engine_displacement', 'horsepower',
                'city_fuel_economy', 'highway_fuel_economy', 'daysonmarket']

corr_matrix = df_encoded[numeric_cols].corr()

plt.figure(figsize=(8,6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Heatmap of Selected Features")
plt.show()

4.3 Distribution Plots of Price and Mileage¶

In [49]:
# Price distribution with log scale
plt.figure(figsize=(12, 6))
sns.histplot(df_encoded['price'], bins=1000)
plt.xscale('log')
plt.title("Distribution of Price (Log Scale)")
plt.xlabel("Price (log scale)")
plt.ylabel("Frequency")
plt.show()

# Mileage distribution with log scale
plt.figure(figsize=(12, 6))
sns.histplot(df_encoded['mileage'], bins=1000)
plt.xscale('log')
plt.title("Distribution of Mileage (Log Scale)")
plt.xlabel("Mileage (log scale)")
plt.ylabel("Frequency")
plt.show()
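One caveat with the log-scaled histograms above: brand-new listings can legitimately report a mileage of 0, and since log(0) is undefined, a log axis silently drops those rows. A minimal sketch of an explicit filter, using hypothetical values:

```python
import pandas as pd

# Hypothetical mileage values: brand-new listings can legitimately report
# 0 miles, and log(0) is undefined, so a log-scaled axis silently drops
# those rows from the histogram.
mileage = pd.Series([0.0, 0.0, 12.0, 5000.0, 42000.0, 98000.0])

positive = mileage[mileage > 0]
print(f"{len(mileage) - len(positive)} zero-mileage rows would vanish on a log axis")
# Pass `positive` (not `mileage`) to sns.histplot before plt.xscale('log')
# to make the exclusion explicit rather than implicit.
```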
4.4 Average Car Price by Brand¶

In [50]:
# Compute the average price by brand from df_encoded
brand_avg_price = df_encoded.groupby('make_name')['price'].mean().reset_index()

# Sort brands by average price in descending order for better visualization
brand_avg_price = brand_avg_price.sort_values(by='price', ascending=False)

plt.figure(figsize=(12,8))
sns.barplot(x='price', y='make_name', hue='make_name', data=brand_avg_price, palette="viridis", legend=False)
plt.title("Average Car Price by Brand")
plt.xlabel("Average Price")
plt.ylabel("Car Brand")
plt.show()

4.5 Analysis of Visualized Trends¶

From the above five visualizations, we can observe the following trends:

  • Ford is the most common brand in the dataset, followed by Chevrolet, Toyota, and Nissan. The distribution among brands is uneven, with a few makes dominating the overall sample.
  • Price is positively correlated with engine displacement and horsepower, and negatively correlated with mileage.
  • Horsepower and engine displacement are strongly correlated with each other.
  • City and highway fuel economies show a strong positive correlation, indicating they tend to move in tandem.
  • The mileage distribution is heavily skewed toward lower values, with the vast majority of vehicles registering well under 100,000 miles.
  • Vehicle prices also exhibit right-skewness, peaking in the lower to mid price range (around tens of thousands of dollars) and tapering off toward the high end.
  • Luxury and performance brands (e.g., Porsche, Land Rover, Mercedes-Benz) command higher average prices. Unsurprisingly, more economy-focused brands (e.g., Kia, Hyundai, Mitsubishi) appear on the lower end of the price scale.
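The skewness observations above can also be quantified numerically rather than judged by eye. A minimal sketch using pandas' built-in sample skewness on synthetic, illustrative prices (the real check would use df_encoded['price']):

```python
import pandas as pd

# Synthetic, right-skewed prices standing in for df_encoded['price'];
# the values here are illustrative only.
prices = pd.Series([15000, 18000, 21000, 24000, 30000, 45000, 90000, 250000],
                   dtype=float)

# A positive sample skewness confirms a long upper tail (right-skew).
print("price skewness:", prices.skew())
assert prices.skew() > 0
```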

5. Modeling¶

In [54]:
# Separate features and target
X = df_encoded.drop('make_name', axis=1)
y = df_encoded['make_name']
In [55]:
# Encode the target labels to integers
le = LabelEncoder()
y_encoded = le.fit_transform(y)
In [56]:
# Split data into training and test sets (using stratification)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)
In [57]:
# Train a Random Forest classifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_clf.fit(X_train, y_train)
Out[57]:
RandomForestClassifier(n_jobs=-1, random_state=42)
In [58]:
# Run evaluation
rf_pred = rf_clf.predict(X_test)

# Display classification report
print("Random Forest Classification Report:")
print(classification_report(y_test, rf_pred))
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      6121
           1       1.00      1.00      1.00      7274
           2       0.99      0.99      0.99     12439
           3       1.00      1.00      1.00     12957
           4       0.99      0.99      0.99      9433
           5       0.98      0.99      0.98     75379
           6       0.98      0.98      0.98      7279
           7       0.99      0.98      0.99     18275
           8       0.99      1.00      0.99     95267
           9       0.97      0.91      0.94     19859
          10       1.00      1.00      1.00     42892
          11       0.99      0.99      0.99     27219
          12       1.00      0.99      1.00      5675
          13       1.00      1.00      1.00      1713
          14       0.99      0.99      0.99     33687
          15       0.99      1.00      0.99     22466
          16       0.99      0.99      0.99      2995
          17       0.98      0.99      0.99      8129
          18       1.00      1.00      1.00      6572
          19       1.00      0.99      1.00      1796
          20       0.99      0.99      0.99     10751
          21       0.99      0.99      0.99     13330
          22       0.99      0.99      0.99      4141
          23       1.00      0.99      1.00     43579
          24       1.00      0.99      0.99      2589
          25       0.99      0.99      0.99     20509
          26       1.00      1.00      1.00     13946
          27       0.99      0.99      0.99     47826
          28       0.99      0.99      0.99     15218
          29       0.99      0.99      0.99      4341

    accuracy                           0.99    593657
   macro avg       0.99      0.99      0.99    593657
weighted avg       0.99      0.99      0.99    593657
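The report indexes classes by integer. Because LabelEncoder sorts its labels, row i corresponds to the i-th brand in alphabetical order, recoverable via `le.classes_`. A minimal sketch with a hypothetical brand list:

```python
import numpy as np

# LabelEncoder sorts labels, so row i of the report above is the i-th
# brand in alphabetical order. Hypothetical brand list for illustration:
brands = ["Ford", "BMW", "Kia", "Ford", "Toyota"]
classes = np.unique(brands).tolist()   # same ordering as le.classes_
print(dict(enumerate(classes)))        # {0: 'BMW', 1: 'Ford', 2: 'Kia', 3: 'Toyota'}
```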

6. Evaluation¶

In [60]:
# Compute the confusion matrix
cm = confusion_matrix(y_test, rf_pred)
class_names = le.classes_  # array of class labels

plt.figure(figsize=(20, 16))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Class')
plt.ylabel('Actual Class')
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.show()
In [61]:
# Get feature importances from the Random Forest model
importances = rf_clf.feature_importances_
feature_names = X_train.columns

# Sort the features by importance and select the top 20
indices = np.argsort(importances)[::-1]
top_n = 20
top_indices = indices[:top_n]

plt.figure(figsize=(12, 8))
sns.barplot(x=importances[top_indices], y=feature_names[top_indices])
plt.title('Top 20 Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()
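One caveat worth noting: impurity-based importances from tree ensembles can overweight continuous or high-cardinality features. Permutation importance is a common cross-check; below is a small self-contained sketch on synthetic data (shapes, hyperparameters, and the informative feature are all illustrative, not those of the notebook's model):

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

# Tiny synthetic dataset: only feature 0 is informative.
rng = np.random.default_rng(42)
X = rng.normal(size=(300, 3))
y = (X[:, 0] > 0).astype(int)

clf = RandomForestClassifier(n_estimators=50, random_state=42).fit(X, y)

# Permuting an informative column should hurt accuracy; permuting the
# noise columns should not.
result = permutation_importance(clf, X, y, n_repeats=5, random_state=42)
print(result.importances_mean)  # feature 0 dominates
```

On the real data, this would be run against the held-out test set rather than the training set to avoid rewarding overfit features.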

We see that our Random Forest classifier delivered an impressive 99% overall accuracy, with consistently high precision, recall, and F1-scores across nearly all brands in the dataset. This strong performance indicates that the chosen features -- encompassing vehicle dimensions, engine characteristics, and market indicators -- are effective at capturing the nuances that distinguish one brand from another. Misclassifications were minimal, suggesting that our preprocessing steps and feature engineering approach successfully reduced noise and preserved brand-specific signals.
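The few misclassifications that remain can be located directly from the confusion matrix by ranking its off-diagonal cells. A minimal sketch with a hypothetical 3-class matrix (in the notebook, `cm` and `class_names = le.classes_` from the cell above would be used instead):

```python
import numpy as np

# Hypothetical 3-class confusion matrix; in the notebook, `cm` and
# `class_names = le.classes_` would be used instead.
cm_demo = np.array([[50, 3, 0],
                    [2, 45, 5],
                    [0, 8, 40]])
names = ["BMW", "Ford", "Kia"]

# Zero the diagonal, then rank (actual, predicted) cells to find the
# brand pair the model confuses most often.
off_diag = cm_demo.copy()
np.fill_diagonal(off_diag, 0)
i, j = np.unravel_index(np.argmax(off_diag), off_diag.shape)
print(f"Most confused: actual {names[i]} predicted as {names[j]} "
      f"({off_diag[i, j]} times)")
```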

7. Storytelling¶

We set out with a straightforward but intriguing question: Can a car's brand be accurately predicted from its specifications alone? To find out, we sourced and processed a rich dataset of used vehicles, focusing on features such as price, mileage, engine details, and vehicle dimensions. Our data preparation steps were meticulous: cleaning out unnecessary columns, encoding categorical variables, and ensuring no information would accidentally give away the brand.

What emerged was a model that exceeded expectations. Our Random Forest classifier achieved impressive accuracy, highlighting that certain attributes -- particularly vehicle dimensions like width, front legroom, and length -- carry substantial weight in identifying a car's brand. While many might assume price or horsepower would dominate, the results suggest that each manufacturer's unique design philosophy leaves discernible traces in a car's proportions. This finding demonstrates how well-chosen features, combined with a robust machine learning approach, can uncover the subtle differences that set brands apart.

8. Impact¶

A trustworthy brand classification system can truly transform how automotive marketplaces and dealerships operate. By sorting cars automatically according to their features, businesses can simplify inventory management, help customers locate exactly what they want, and adjust pricing or marketing strategies for different segments. When accuracy is high, it builds confidence: buyers trust that they’re finding exactly what they need, while sellers enjoy consistent, data-driven listings.

At the same time, it’s crucial to be clear about how the model is developed and tested. Even a few misclassifications can have a ripple effect—affecting pricing decisions or steering marketing efforts in the wrong direction. That’s why regular performance reviews and transparent communication about any limitations are essential. This approach means being open to ongoing improvements, whether that involves adding new features or exploring different algorithms.

In short, our results highlight the powerful impact machine learning can have on the automotive industry. By leveraging solid data, rigorous data preparation, and advanced models, we're building a system that not only classifies car brands accurately but also lays the groundwork for even more innovative applications in the future.

9. References¶

Dataset used: https://www.kaggle.com/datasets/ananaymital/us-used-cars-dataset/data