A regression tree is essentially a decision tree whose splits predict a numeric target. It is a form of nonparametric regression built from step functions: the idea is that, with a sufficient number of sample splits, any functional form can be approximated arbitrarily well. Regression trees are particularly useful for regression with categorical variables, where traditional nonparametric methods such as kernel and series estimators do not apply [1].
[1] indicates that the literature on regression trees has developed a colorful language for describing tools based on the metaphor of a living tree:
i) A split point is a node. ii) A subsample is a branch. iii) Increasing the set of nodes is growing a tree. iv) Decreasing the set of nodes is pruning a tree.
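To make the step-function idea concrete, here is a minimal sketch (my own illustrative example, not from [1]) that fits a shallow regression tree to a one-dimensional sample and shows that its predictions are piecewise constant:

import numpy as np
from sklearn.tree import DecisionTreeRegressor
# Toy 1-D data: a noisy sine curve (illustrative only).
rng = np.random.default_rng(0)
x = np.sort(rng.uniform(0, 6, 80)).reshape(-1, 1)
y_toy = np.sin(x).ravel() + rng.normal(0, 0.1, 80)
# A shallow tree approximates the curve with a handful of constant steps:
# depth 2 gives at most 4 leaves, hence at most 4 distinct predicted values.
toy_tree = DecisionTreeRegressor(max_depth=2).fit(x, y_toy)
grid = np.linspace(0, 6, 200).reshape(-1, 1)
print(np.unique(toy_tree.predict(grid)))

With more splits, the steps become finer and the tree tracks the underlying curve more closely, which is exactly the sense in which a regression tree is a nonparametric estimator.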
In this post, we focus on fitting regression trees with Scikit-Learn. Let's start with a sample data set, described at https://nl.mathworks.com/help/stats/select-predictors-for-random-forests.html. You can download the xlsx version of the data from https://github.com/halilibrahimgunduz/data.
First, we load the car dataset. Second, we focus on predicting a car's fuel efficiency from a number of potential predictor variables: number of cylinders, engine displacement, horsepower, weight, acceleration, model year, and country of origin. Finally, we look at the measurement types of these variables.
import pandas as pd
import matplotlib.pyplot as plt
# Load the car dataset from the Excel file and inspect its structure.
df=pd.read_excel("test.xlsx")
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 406 entries, 0 to 405
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   MPG           398 non-null    float64
 1   Cylinders     406 non-null    int64
 2   Displacement  406 non-null    float64
 3   Horsepower    400 non-null    float64
 4   Weight        406 non-null    int64
 5   Acceleration  406 non-null    float64
 6   Model_Year    406 non-null    int64
 7   Origin        406 non-null    object
dtypes: float64(4), int64(3), object(1)
memory usage: 25.5+ KB
This data set contains 8 variables measured on 406 vehicles. Seven of these variables are quantitative and one is qualitative. We also see that some observations are missing: MPG has 398 non-null values and Horsepower has 400, while the remaining variables have all 406. We shall remove any rows that have a missing value:
df=df.dropna()
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 405
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   MPG           392 non-null    float64
 1   Cylinders     392 non-null    int64
 2   Displacement  392 non-null    float64
 3   Horsepower    392 non-null    float64
 4   Weight        392 non-null    int64
 5   Acceleration  392 non-null    float64
 6   Model_Year    392 non-null    int64
 7   Origin        392 non-null    object
dtypes: float64(4), int64(3), object(1)
memory usage: 27.6+ KB
The variable Origin is qualitative. We use OrdinalEncoder to transform these categorical observations into numeric values:
from sklearn.preprocessing import OrdinalEncoder
enc=OrdinalEncoder()
df[["Origin"]]=enc.fit_transform(df[["Origin"]])
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 405
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   MPG           392 non-null    float64
 1   Cylinders     392 non-null    int64
 2   Displacement  392 non-null    float64
 3   Horsepower    392 non-null    float64
 4   Weight        392 non-null    int64
 5   Acceleration  392 non-null    float64
 6   Model_Year    392 non-null    int64
 7   Origin        392 non-null    float64
dtypes: float64(5), int64(3)
memory usage: 27.6 KB
From the test.xlsx file, you can see that this variable has 7 different categories. We can access these categories through enc.categories_, which returns them sorted alphabetically rather than by order of appearance.
enc.categories_
[array([0., 1., 2., 3., 4., 5., 6.])]
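If we later need the original labels back, the fitted encoder can reverse the mapping; here is a quick illustration using its inverse_transform method:

# Map the numeric codes in Origin back to the original category labels.
print(enc.inverse_transform(df[["Origin"]].head()))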
We then define X as the matrix of predictors used in our regression tree and y as MPG (the fuel efficiency of a car):
y=df[["MPG"]]
X=df[["Cylinders","Displacement","Horsepower","Weight","Acceleration","Model_Year","Origin"]]
After saving the features and MPG values to X and y, we create the training and test sets using train_test_split. In my opinion, we should allocate roughly 80% of the data for training and 20% for testing.
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=78)
Setting random_state is useful so that your scripts always split the dataset in the same way, keeping the results reproducible.
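As a quick sanity check (a minimal sketch of my own, not part of the original workflow), repeating the split with the same random_state reproduces exactly the same rows:

# Splitting again with random_state=78 yields identical train/test indices.
X_tr2,X_te2,y_tr2,y_te2=train_test_split(X,y,test_size=0.2,random_state=78)
print(X_train.index.equals(X_tr2.index))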
First, we need to import DecisionTreeRegressor and pass it some useful settings. When using a family of functions, it is always worth studying the options they expose; detailed information is available at https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html. For instance, I prefer criterion="poisson", which chooses splits by the reduction in Poisson deviance. I also set max_depth=3, since you should cap how far the tree is allowed to grow (see [1] for further details on limiting tree size), and random_state=78 may be selected again. A minimal sketch of the Poisson deviance calculation is given below, before we fit the model.
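For intuition only, here is a sketch of the quantity the "poisson" criterion works with: the Poisson deviance of a node, which scikit-learn exposes as sklearn.metrics.mean_poisson_deviance. The helper function and the example threshold below are my own illustration, not scikit-learn's internal code; a split is attractive when it produces a large reduction in the children's total deviance relative to the parent.

import numpy as np
from sklearn.metrics import mean_poisson_deviance
def split_deviance_reduction(y_parent,y_left,y_right):
    # Deviance of a node that predicts its own mean, weighted by node size.
    def node_dev(y_node):
        y_node=np.asarray(y_node,dtype=float)
        return len(y_node)*mean_poisson_deviance(y_node,np.full_like(y_node,y_node.mean()))
    # Larger values mean the candidate split explains more of the variation.
    return node_dev(y_parent)-(node_dev(y_left)+node_dev(y_right))
# Example (hypothetical threshold): split the training MPG values at Weight >= 3000.
# heavy=X_train["Weight"]>=3000
# print(split_deviance_reduction(y_train["MPG"],y_train["MPG"][~heavy],y_train["MPG"][heavy]))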
from sklearn.tree import DecisionTreeRegressor
# Build a depth-3 regression tree that uses the Poisson split criterion.
Regression_Trees_model=DecisionTreeRegressor(criterion="poisson",max_depth=3,random_state=78)
Regression_Trees_model.fit(X_train, y_train)
# Predict fuel efficiency for the held-out test observations.
predictions=Regression_Trees_model.predict(X_test)
print(predictions)
print("Length of the predictions array: ", len(predictions))
[29.0325 28.9375 29.0325 16.70714286 23.08135593 29.0325
 18.95849057 13.7734375 36.1974359 16.70714286 27.25 13.7734375
 13.7734375 23.08135593 23.08135593 23.08135593 13.7734375 16.70714286
 18.95849057 23.08135593 18.95849057 29.0325 18.95849057 36.1974359
 16.70714286 18.95849057 29.0325 29.0325 36.1974359 29.0325
 29.0325 28.9375 13.7734375 36.1974359 36.1974359 23.08135593
 23.08135593 18.95849057 13.7734375 16.70714286 18.95849057 28.9375
 36.1974359 29.0325 36.1974359 18.95849057 29.0325 13.7734375
 23.08135593 29.0325 23.08135593 13.7734375 29.0325 13.7734375
 18.95849057 28.9375 23.08135593 23.08135593 36.1974359 18.95849057
 18.95849057 13.7734375 27.25 28.9375 13.7734375 18.95849057
 23.08135593 23.08135593 28.9375 23.08135593 36.1974359 23.08135593
 23.08135593 16.70714286 13.7734375 18.95849057 18.95849057 29.0325
 18.95849057]
Length of the predictions array:  79
Note that the number of predicted values (79) matches the number of observations reserved for testing.
y_test
|     | MPG  |
|-----|------|
| 390 | 34.0 |
| 136 | 31.0 |
| 348 | 23.5 |
| 271 | 18.1 |
| 274 | 27.5 |
| ... | ...  |
| 9   | 15.0 |
| 54  | 19.0 |
| 207 | 18.0 |
| 326 | 31.3 |
| 234 | 19.0 |

79 rows × 1 columns
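To put the predicted and actual values side by side, one could build a small comparison frame (a quick sketch; the Predicted_MPG column name is my own choice):

# Align the model's predictions with the actual MPG values in the test set.
comparison=y_test.copy()
comparison["Predicted_MPG"]=predictions
print(comparison.head())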
With predictions and y_test, model performance can be calculated using the root mean square error (RMSE):
import math
import sklearn.metrics
mse = sklearn.metrics.mean_squared_error(y_test,predictions)
rmse = math.sqrt(mse)
print("The difference between actual and predicted values", rmse)
The difference between actual and predicted values 4.110725703480176
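RMSE is not the only yardstick; as a supplementary check (a brief sketch of my own), the same sklearn.metrics module also provides the mean absolute error and the R² score:

from sklearn.metrics import mean_absolute_error, r2_score
# Two complementary summaries of fit quality on the test set.
print("MAE:",mean_absolute_error(y_test,predictions))
print("R2:",r2_score(y_test,predictions))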
Let's visualize our regression tree (classification trees tend to be even more informative in their graphical representation):
from sklearn.tree import plot_tree
plt.figure(figsize=(10,10), dpi=200)
plot_tree(Regression_Trees_model, feature_names=X.columns);
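In addition to the plot, a text summary of the same fitted tree can be printed with scikit-learn's export_text (a small sketch):

from sklearn.tree import export_text
# Print the split rules and leaf predictions as plain text.
print(export_text(Regression_Trees_model,feature_names=list(X.columns)))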
As you will notice from the plot, the tree stops splitting after 3 levels below the root, which is the maximum depth we specified. The significance of what we've obtained lies in our ability to estimate numerical values using a regression tree. In subsequent stages, we will move on to bagging and random forests!
[1] Hansen, B. (2022). Econometrics. Princeton University Press.