data:image/s3,"s3://crabby-images/039c6/039c6100f520838ad35a86fd25dd00ae00556526" alt=""
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as pl
import seaborn as sns
import warnings
# Silence every warning category for the rest of the script.
warnings.filterwarnings('ignore')
# Raw string: the Windows path is full of backslashes, and sequences such as
# "\k", "\M" and "\i" are invalid escapes in a normal string literal.
data = pd.read_csv(r'F:\kaggleDataSet\MedicalCostPersonal\insurance.csv')
data:image/s3,"s3://crabby-images/5ae56/5ae56de1fcd504387d3de347f24bb4f6735bfe86" alt=""
data:image/s3,"s3://crabby-images/b70e6/b70e62e322e95f8e3ea0e799756e1d4beefb8ce7" alt=""
from sklearn.preprocessing import LabelEncoder
# Replace the sex, smoker and region columns with integer codes.  One encoder
# is re-fitted per column, so after the loop `le` holds the `region` classes.
le = LabelEncoder()
for column in ('sex', 'smoker', 'region'):
    data[column] = le.fit_transform(data[column])
# Correlation of every column with `charges`, in ascending order.
data.corr()['charges'].sort_values()
data:image/s3,"s3://crabby-images/65eb0/65eb05005a6916be2532e232abf54a7bd66bf9ba" alt=""
# Heat map of the pairwise correlations between all columns of `data`.
f, ax = pl.subplots(figsize=(10, 8))
corr = data.corr()
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24.  The mask is all False, so no cell is hidden.
sns.heatmap(
    corr,
    mask=np.zeros_like(corr, dtype=bool),
    cmap=sns.diverging_palette(240, 10, as_cmap=True),
    square=True,
    ax=ax,
)
data:image/s3,"s3://crabby-images/584f4/584f4239578acc92dfa11f71372c8bd86232200f" alt=""
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
output_notebook()
import scipy.special
from bokeh.layouts import gridplot
from bokeh.plotting import figure, show, output_file
# Histogram of `charges` drawn with Bokeh: one quad per NumPy histogram bin.
p = figure(
    title="Distribution of charges",
    tools="save",
    background_fill_color="#E8DDCB",
)
hist, edges = np.histogram(data.charges)
p.quad(
    top=hist,
    bottom=0,
    left=edges[:-1],
    right=edges[1:],
    fill_color="#036564",
    line_color="#033649",
)
p.xaxis.axis_label = 'x'
# NOTE(review): `hist` holds raw bin counts (no density normalisation), while
# the label below reads 'Pr(x)' -- confirm which one is intended.
p.yaxis.axis_label = 'Pr(x)'
layout = gridplot(p, ncols=2, plot_width=400, plot_height=400, toolbar_location=None)
show(layout)
data:image/s3,"s3://crabby-images/0384f/0384f69fdc2db498b6e108522df8bfbafdc9ed9c" alt=""
# Two side-by-side distributions of `charges`, one per value of the encoded
# smoker column (1 on the left, 0 on the right).
f = pl.figure(figsize=(12, 5))
panels = (
    (121, 1, 'c', 'Distribution of charges for smokers'),
    (122, 0, 'b', 'Distribution of charges for non-smokers'),
)
for position, smoker_code, colour, heading in panels:
    ax = f.add_subplot(position)
    subset = data[data.smoker == smoker_code]
    sns.distplot(subset["charges"], color=colour, ax=ax)
    ax.set_title(heading)
data:image/s3,"s3://crabby-images/5795e/5795e9a26392ac211912df81f2767811215d3e72" alt=""
# Bar chart counting rows per `smoker` value, with a separate bar per `sex`.
sns.catplot(
    data=data,
    x="smoker",
    hue='sex',
    kind="count",
    palette="pink",
)
data:image/s3,"s3://crabby-images/c25bd/c25bdb6b7ed290cf73dfb4faa0a4692e023ec4ba" alt=""
# Violin plots of `charges` for each `sex` value, coloured by `smoker`.
sns.catplot(
    data=data,
    x="sex",
    y="charges",
    hue="smoker",
    kind="violin",
    palette='magma',
)
data:image/s3,"s3://crabby-images/a4da7/a4da76d20e78a3cb1c2cf7c92e3a534eb2ea0d07" alt=""
# Horizontal box plots of `charges` per smoker code, women only.
pl.figure(figsize=(12,5))
pl.title("Box plot for charges of women")
# LabelEncoder numbers its classes in sorted order, so with 'female'/'male'
# labels (assumed -- confirm against the CSV) women are code 0, not 1.
women = data[data.sex == 0]
sns.boxplot(y="smoker", x="charges", data=women, orient="h", palette='magma')
data:image/s3,"s3://crabby-images/68135/681352df0c34720a2cabed9e7bf39a4d2e501d93" alt=""
# Horizontal box plots of `charges` per smoker code, men only.
pl.figure(figsize=(12,5))
pl.title("Box plot for charges of men")
# LabelEncoder numbers its classes in sorted order, so with 'female'/'male'
# labels (assumed -- confirm against the CSV) men are code 1, not 0.
men = data[data.sex == 1]
sns.boxplot(y="smoker", x="charges", data=men, orient="h", palette='rainbow')
data:image/s3,"s3://crabby-images/b52f8/b52f8ba7a77660f26e2e5fc6ce47db048a8f8177" alt=""
# Distribution plot of the `age` column on a wide figure.
pl.figure(figsize=(12, 5))
pl.title("Distribution of age")
ages = data["age"]
ax = sns.distplot(ages, color='g')
data:image/s3,"s3://crabby-images/acb32/acb327f2294a299e952fa64800add693d73347b0" alt=""
# Smoker counts split by sex, restricted to rows where `age` equals 18.
eighteen = data[data.age == 18]
sns.catplot(
    data=eighteen,
    x="smoker",
    hue='sex',
    kind="count",
    palette="rainbow",
)
pl.title("The number of smokers and non-smokers (18 years old)")
data:image/s3,"s3://crabby-images/100ae/100ae2eaf5ec0a4ecfa1d25efe9e52398072726f" alt=""
data:image/s3,"s3://crabby-images/91892/918928499712e7a39c4b1e146c3dcf431d17dddf" alt=""
# Joint KDE of age vs. charges for rows with smoker code 0, with the raw
# points overlaid as white "+" markers.
g = sns.jointplot(x="age", y="charges", data=data[data.smoker == 0], kind="kde", color="m")
g.plot_joint(pl.scatter, c="w", s=30, linewidth=1, marker="+")
# Make the first collection on the joint axes fully transparent.
g.ax_joint.collections[0].set_alpha(0)
g.set_axis_labels("$X$", "$Y$")
# Title the joint grid's own figure: the module-level `ax` belongs to an
# earlier, unrelated plot, so `ax.set_title` would label the wrong chart.
# y > 1 keeps the text clear of the top marginal axes.
g.ax_joint.figure.suptitle('Distribution of charges and age for non-smokers', y=1.02)
data:image/s3,"s3://crabby-images/29195/2919536ed5148a170f142dc5a1ee9a8513165044" alt=""
# Joint KDE of age vs. charges for rows with smoker code 1, with the raw
# points overlaid as white "+" markers.
g = sns.jointplot(x="age", y="charges", data=data[data.smoker == 1], kind="kde", color="c")
g.plot_joint(pl.scatter, c="w", s=30, linewidth=1, marker="+")
# Make the first collection on the joint axes fully transparent.
g.ax_joint.collections[0].set_alpha(0)
g.set_axis_labels("$X$", "$Y$")
# Title the joint grid's own figure: the module-level `ax` belongs to an
# earlier, unrelated plot, so `ax.set_title` would label the wrong chart.
# y > 1 keeps the text clear of the top marginal axes.
g.ax_joint.figure.suptitle('Distribution of charges and age for smokers', y=1.02)
data:image/s3,"s3://crabby-images/c278a/c278a228ed39b930066d0b6e60f7f0babfb33d98" alt=""
# Bokeh scatter of age vs. charges for rows with smoker code 0.
non_smokers = data[data.smoker == 0]
p = figure(plot_width=500, plot_height=450)
p.circle(
    x=non_smokers.age,
    y=non_smokers.charges,
    size=7,
    line_color="navy",
    fill_color="pink",
    fill_alpha=0.9,
)
show(p)
data:image/s3,"s3://crabby-images/1c417/1c417a81e1573e1289da5aa472b79631da13b8be" alt=""
data:image/s3,"s3://crabby-images/6e9af/6e9af2e361f61e7fcc507a260dc9e15f46cd4157" alt=""
# Bokeh scatter of age vs. charges for rows with smoker code 1.
smokers = data[data.smoker == 1]
p = figure(plot_width=500, plot_height=450)
p.circle(
    x=smokers.age,
    y=smokers.charges,
    size=7,
    line_color="navy",
    fill_color="red",
    fill_alpha=0.9,
)
show(p)
data:image/s3,"s3://crabby-images/da21f/da21f2eb97e543d56c3076c4fb5ff0af46db6fd8" alt=""
data:image/s3,"s3://crabby-images/e3f74/e3f74be04eb69d7b876b40509ce9adb019fa718c" alt=""
# Linear fits of charges against age, one regression line per smoker code.
# `height` is the facet-size keyword since seaborn 0.9, the same release that
# introduced `catplot` (used elsewhere in this script); `size` is the old name.
grid = sns.lmplot(x="age", y="charges", hue="smoker", data=data, palette='inferno_r', height=7)
# Title the axes that lmplot just created; the module-level `ax` still points
# at an earlier plot.
grid.axes.flat[0].set_title('Smokers and non-smokers')
data:image/s3,"s3://crabby-images/57064/5706465811a9634974078c5bcc5ba5a59bbaf5cc" alt=""
# Distribution plot of the `bmi` column on a wide figure.
pl.figure(figsize=(12, 5))
pl.title("Distribution of bmi")
bmi_values = data["bmi"]
ax = sns.distplot(bmi_values, color='m')
data:image/s3,"s3://crabby-images/58743/587437a49c67200502fd8f81ca86bad0c8988a6f" alt=""
# Distribution of `charges` for rows whose bmi is at least 30.
pl.figure(figsize=(12, 5))
pl.title("Distribution of charges for patients with BMI greater than 30")
high_bmi = data[data.bmi >= 30]
ax = sns.distplot(high_bmi['charges'], color='m')
data:image/s3,"s3://crabby-images/42143/42143826d63f3a81abd5c0b4eba34f2c7c974365" alt=""
# Distribution of `charges` for rows whose bmi is below 30.
pl.figure(figsize=(12, 5))
pl.title("Distribution of charges for patients with BMI less than 30")
low_bmi = data[data.bmi < 30]
ax = sns.distplot(low_bmi['charges'], color='b')
data:image/s3,"s3://crabby-images/86f4f/86f4f88d21bb4b01943e7636ce92e8412f0fa7a5" alt=""
# Joint KDE of bmi vs. charges over all rows, with the raw points overlaid as
# white "+" markers.
g = sns.jointplot(x="bmi", y="charges", data=data, kind="kde", color="r")
g.plot_joint(pl.scatter, c="w", s=30, linewidth=1, marker="+")
# Make the first collection on the joint axes fully transparent.
g.ax_joint.collections[0].set_alpha(0)
g.set_axis_labels("$X$", "$Y$")
# Title the joint grid's own figure: the module-level `ax` belongs to an
# earlier, unrelated plot, so `ax.set_title` would label the wrong chart.
# y > 1 keeps the text clear of the top marginal axes.
g.ax_joint.figure.suptitle('Distribution of bmi and charges', y=1.02)
data:image/s3,"s3://crabby-images/3aac8/3aac8167b70cc167a8e6e39d40d78049e5f16d24" alt=""
# Scatter of charges against bmi coloured by smoker code, followed by a
# separate figure with one linear fit per smoker code.
pl.figure(figsize=(10,6))
ax = sns.scatterplot(x='bmi', y='charges', data=data, palette='magma', hue='smoker')
ax.set_title('Scatter plot of charges and bmi')
# `height` is the facet-size keyword since seaborn 0.9, the same release that
# introduced `catplot` (used elsewhere in this script); `size` is the old name.
sns.lmplot(x="bmi", y="charges", hue="smoker", data=data, palette='magma', height=8)
data:image/s3,"s3://crabby-images/df0a6/df0a61ededdb1b018f3c1dd36879b9dc37d369e1" alt=""
data:image/s3,"s3://crabby-images/29307/29307e4c8a4bb1084b31c41bf455caac8a625e2b" alt=""
# Bar chart counting rows per value of `children`.
# `height` is the facet-size keyword in every seaborn release that has
# `catplot`; `size` is the pre-0.9 name.
sns.catplot(x="children", kind="count", palette="ch:.25", data=data, height=6)
data:image/s3,"s3://crabby-images/d0f9c/d0f9c4e2f05cbce91c67ee61e348274a5a3ac2cb" alt=""
# Smoker counts split by sex, restricted to rows with at least one child.
# `height` is the facet-size keyword in every seaborn release that has
# `catplot`; `size` is the pre-0.9 name.
with_children = data[data.children > 0]
grid = sns.catplot(x="smoker", kind="count", palette="rainbow", hue="sex",
                   data=with_children, height=6)
# Title the axes that catplot just created; the module-level `ax` still points
# at an earlier plot.
grid.axes.flat[0].set_title('Smokers and non-smokers who have childrens')
data:image/s3,"s3://crabby-images/a4bf5/a4bf5f9e101c0bac22652e6b2fdfe9720f25abf0" alt=""
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.ensemble import RandomForestRegressor
# Baseline model: ordinary least squares on every column except the target.
y = data.charges
x = data.drop(columns=['charges'])
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)
lr = LinearRegression()
lr.fit(x_train, y_train)
y_train_pred = lr.predict(x_train)
y_test_pred = lr.predict(x_test)
# Score of the fitted model on the held-out split.
test_score = lr.score(x_test, y_test)
print(test_score)
data:image/s3,"s3://crabby-images/26cf0/26cf050857d55d6b03436932b1bfcb01b4b24d54" alt=""
# Degree-2 polynomial regression on all columns except the target and `region`.
Y = data.charges
X = data.drop(columns=['charges', 'region'])
quad = PolynomialFeatures(degree=2)
x_quad = quad.fit_transform(X)
X_train, X_test, Y_train, Y_test = train_test_split(x_quad, Y, random_state=0)
plr = LinearRegression()
plr.fit(X_train, Y_train)
Y_train_pred = plr.predict(X_train)
Y_test_pred = plr.predict(X_test)
# Score of the polynomial model on the held-out split.
poly_test_score = plr.score(X_test, Y_test)
print(poly_test_score)
data:image/s3,"s3://crabby-images/10514/10514835d2f688d825c092ab25e5ce91baab98bb" alt=""
# Random forest trained on the x_train/y_train split created for the linear
# baseline, then scored on both the training and the held-out data.
# `criterion` is left at its default (squared error): the 'mse' spelling was
# deprecated in scikit-learn 1.0 and removed in 1.2, while its replacement
# 'squared_error' is not accepted before 1.0.
forest = RandomForestRegressor(n_estimators=100, random_state=1, n_jobs=-1)
forest.fit(x_train, y_train)
forest_train_pred = forest.predict(x_train)
forest_test_pred = forest.predict(x_test)
print('MSE train data: %.3f, MSE test data: %.3f' % (
    mean_squared_error(y_train, forest_train_pred),
    mean_squared_error(y_test, forest_test_pred)))
print('R2 train data: %.3f, R2 test data: %.3f' % (
    r2_score(y_train, forest_train_pred),
    r2_score(y_test, forest_test_pred)))
data:image/s3,"s3://crabby-images/0a841/0a84193492d9edcd11a96f0203ac8a3fa0a44371" alt=""
# Residual plot for the random forest: (prediction - actual) against the
# prediction, for the training and the held-out data.
pl.figure(figsize=(10,6))
pl.scatter(forest_train_pred, forest_train_pred - y_train,
           c='black', marker='o', s=35, alpha=0.5, label='Train data')
pl.scatter(forest_test_pred, forest_test_pred - y_test,
           c='c', marker='o', s=35, alpha=0.7, label='Test data')
pl.xlabel('Predicted values')
# The plotted quantity is the prediction error, i.e. the residuals; the
# earlier label 'Tailings' was a mistranslation of that term.
pl.ylabel('Residuals')
pl.legend(loc='upper left')
# Zero-error reference line across the fixed x range 0..60000.
pl.hlines(y=0, xmin=0, xmax=60000, lw=2, color='red')
pl.show()
data:image/s3,"s3://crabby-images/4d0fa/4d0fac97ad4582f708c64ebbad08701010908f06" alt=""