Ce jeu de données contient les prix de vente des maisons du comté de King (État de Washington), qui comprend notamment la ville de Seattle. Il couvre les maisons vendues entre mai 2014 et mai 2015.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# Load the house-sales records from the CSV file into a DataFrame.
house_sales = pd.read_csv(filepath_or_buffer='../static/home_data.csv')
# Preview the first five rows as a quick sanity check.
house_sales.head(n=5)
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | ... | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7129300520 | 20141013T000000 | 221900 | 3 | 1.00 | 1180 | 5650 | 1.0 | 0 | 0 | ... | 7 | 1180 | 0 | 1955 | 0 | 98178 | 47.5112 | -122.257 | 1340 | 5650 |
1 | 6414100192 | 20141209T000000 | 538000 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | ... | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 47.7210 | -122.319 | 1690 | 7639 |
2 | 5631500400 | 20150225T000000 | 180000 | 2 | 1.00 | 770 | 10000 | 1.0 | 0 | 0 | ... | 6 | 770 | 0 | 1933 | 0 | 98028 | 47.7379 | -122.233 | 2720 | 8062 |
3 | 2487200875 | 20141209T000000 | 604000 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0 | 0 | ... | 7 | 1050 | 910 | 1965 | 0 | 98136 | 47.5208 | -122.393 | 1360 | 5000 |
4 | 1954400510 | 20150218T000000 | 510000 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | ... | 8 | 1680 | 0 | 1987 | 0 | 98074 | 47.6168 | -122.045 | 1800 | 7503 |
5 rows × 21 columns
# Number of rows (sales records) in the dataset.
house_sales.shape[0]
21613
house_sales.dtypes
id int64 date object price int64 bedrooms int64 bathrooms float64 sqft_living int64 sqft_lot int64 floors float64 waterfront int64 view int64 condition int64 grade int64 sqft_above int64 sqft_basement int64 yr_built int64 yr_renovated int64 zipcode int64 lat float64 long float64 sqft_living15 int64 sqft_lot15 int64 dtype: object
# Count the missing values in each column.
house_sales.isna().sum()
id 0 date 0 price 0 bedrooms 0 bathrooms 0 sqft_living 0 sqft_lot 0 floors 0 waterfront 0 view 0 condition 0 grade 0 sqft_above 0 sqft_basement 0 yr_built 0 yr_renovated 0 zipcode 0 lat 0 long 0 sqft_living15 0 sqft_lot15 0 dtype: int64
On peut aussi utiliser la méthode `info`, qui combine ces informations :
house_sales.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 21613 entries, 0 to 21612 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 21613 non-null int64 1 date 21613 non-null object 2 price 21613 non-null int64 3 bedrooms 21613 non-null int64 4 bathrooms 21613 non-null float64 5 sqft_living 21613 non-null int64 6 sqft_lot 21613 non-null int64 7 floors 21613 non-null float64 8 waterfront 21613 non-null int64 9 view 21613 non-null int64 10 condition 21613 non-null int64 11 grade 21613 non-null int64 12 sqft_above 21613 non-null int64 13 sqft_basement 21613 non-null int64 14 yr_built 21613 non-null int64 15 yr_renovated 21613 non-null int64 16 zipcode 21613 non-null int64 17 lat 21613 non-null float64 18 long 21613 non-null float64 19 sqft_living15 21613 non-null int64 20 sqft_lot15 21613 non-null int64 dtypes: float64(4), int64(16), object(1) memory usage: 3.5+ MB
# Parse the raw date strings (e.g. '20141013T000000') into datetimes.
# Spelling out the format avoids relying on pandas' format guessing.
house_sales['date'] = pd.to_datetime(house_sales['date'], format='%Y%m%dT%H%M%S')
# Which columns are affected? Every column whose name contains 'sqft'.
col_sqft = house_sales.filter(like='sqft').columns.tolist()
col_sqft
['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'sqft_living15', 'sqft_lot15']
# Convert every square-foot column to square metres. The result is stored in
# a new column whose name has 'sqft' replaced by 'sqm'.
SQFT_PER_SQM = 10.764  # number of square feet in one square metre
for col_name in col_sqft:
    house_sales[col_name.replace('sqft', 'sqm')] = house_sales[col_name] / SQFT_PER_SQM
# Remove the original square-foot columns in place (no copy of the DataFrame).
house_sales.drop(columns=col_sqft, inplace=True)
# Flag renovated houses: rows with yr_renovated == 0 are treated as never
# renovated. The comparison already yields a boolean Series, so np.where is
# not needed.
house_sales['renovated'] = house_sales['yr_renovated'] > 0
house_sales.head()
id | date | price | bedrooms | bathrooms | floors | waterfront | view | condition | grade | ... | zipcode | lat | long | sqm_living | sqm_lot | sqm_above | sqm_basement | sqm_living15 | sqm_lot15 | renovated | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7129300520 | 2014-10-13 | 221900 | 3 | 1.00 | 1.0 | 0 | 0 | 3 | 7 | ... | 98178 | 47.5112 | -122.257 | 109.624675 | 524.897808 | 109.624675 | 0.000000 | 124.489038 | 524.897808 | False |
1 | 6414100192 | 2014-12-09 | 538000 | 3 | 2.25 | 2.0 | 0 | 0 | 3 | 7 | ... | 98125 | 47.7210 | -122.319 | 238.758826 | 672.798216 | 201.597919 | 37.160907 | 157.004831 | 709.680416 | True |
2 | 5631500400 | 2015-02-25 | 180000 | 2 | 1.00 | 1.0 | 0 | 0 | 3 | 6 | ... | 98028 | 47.7379 | -122.233 | 71.534745 | 929.022668 | 71.534745 | 0.000000 | 252.694166 | 748.978075 | False |
3 | 2487200875 | 2014-12-09 | 604000 | 4 | 3.00 | 1.0 | 0 | 0 | 5 | 7 | ... | 98136 | 47.5208 | -122.393 | 182.088443 | 464.511334 | 97.547380 | 84.541063 | 126.347083 | 464.511334 | False |
4 | 1954400510 | 2015-02-18 | 510000 | 3 | 2.00 | 1.0 | 0 | 0 | 3 | 8 | ... | 98074 | 47.6168 | -122.045 | 156.075808 | 750.650316 | 156.075808 | 0.000000 | 167.224080 | 697.045708 | False |
5 rows × 22 columns
house_sales.dtypes
id int64 date datetime64[ns] price int64 bedrooms int64 bathrooms float64 floors float64 waterfront int64 view int64 condition int64 grade int64 yr_built int64 yr_renovated int64 zipcode int64 lat float64 long float64 sqm_living float64 sqm_lot float64 sqm_above float64 sqm_basement float64 sqm_living15 float64 sqm_lot15 float64 renovated bool dtype: object
# Histogram of sale prices using the pandas plotting accessor.
house_sales['price'].plot.hist(figsize=(10, 4), title="Price of Houses in King's County")
<Axes: title={'center': "Price of Houses in King's County"}, ylabel='Frequency'>
On peut aussi utiliser la bibliothèque `seaborn` et écrire quelques lignes de code supplémentaires pour améliorer le graphique :
# Histogram of sale prices with a kernel-density curve, using 200 bins.
price_dist = sns.histplot(house_sales["price"], bins=200, kde=True)
# Label the axes and title, then resize the figure that owns the Axes.
price_dist.set_xlabel("Price in Millions")
price_dist.set_title("Price Density of Houses in King's County")
price_dist.figure.set_size_inches(10, 4)
price_dist
<Axes: title={'center': "Price Density of Houses in King's County"}, xlabel='Price in Millions', ylabel='Count'>
# Histogram (with density curve) of the natural logarithm of the sale price.
logged_price_dist = sns.histplot(np.log(house_sales["price"]), kde=True)
logged_price_dist.figure.set_size_inches(10, 6)
# The axis shows ln(price) computed on the raw price column, not a value
# "in millions", so the label says exactly that.
logged_price_dist.set(xlabel="Log Price (natural log of price)", title="Log Price Density of Houses in King's County")
logged_price_dist
<Axes: title={'center': "Log Price Density of Houses in King's County"}, xlabel='Log Price in Millions', ylabel='Count'>
house_sales['floors'].value_counts()
floors 1.0 10680 2.0 8241 1.5 1910 3.0 613 2.5 161 3.5 8 Name: count, dtype: int64
Mais le résultat est trié par nombre de maisons et non par nombre d'étages… on va y remédier :
# Count houses per number of floors, ordered by the floor value (the index)
# rather than by the count.
count_by_floor = house_sales['floors'].value_counts().sort_index()
count_by_floor
floors 1.0 10680 1.5 1910 2.0 8241 2.5 161 3.0 613 3.5 8 Name: count, dtype: int64
# Bar chart of the number of houses for each floor count.
count_by_floor.plot.bar(title="House count by number of floors")
<Axes: title={'center': 'House count by number of floors'}, xlabel='floors'>
On utilise la fonction `boxplot` de la bibliothèque `seaborn` pour comparer les prix des maisons avec ou sans vue sur le front de mer :
sns.boxplot(x=house_sales.waterfront, y=house_sales.price)
<Axes: xlabel='waterfront', ylabel='price'>
Attention, seulement 163 maisons de notre jeu de données sont face à l'eau.
# Number of houses flagged as waterfront (waterfront == 1).
int((house_sales['waterfront'] == 1).sum())
163
# One price box per grade level.
ax = sns.boxplot(data=house_sales, x='grade', y='price')
ax.set_title('Price distribution across grade levels')
ax.figure.set_size_inches(10, 6)
ax
<Axes: title={'center': 'Price distribution across grade levels'}, xlabel='grade', ylabel='price'>
On utilise la méthode `corr` des DataFrame (« Calcul de la corrélation par paire des colonnes, en excluant les valeurs NA/nulles »), en enlevant certaines colonnes :
corr = house_sales.loc[:, ~house_sales.columns.isin(['id', 'date', 'lat', 'long'])].corr()
# on aurait aussi pu écrire `corr = house_sales.drop(['id', 'date', 'lat', 'long'], axis=1).corr()`
corr
price | bedrooms | bathrooms | floors | waterfront | view | condition | grade | yr_built | yr_renovated | zipcode | sqm_living | sqm_lot | sqm_above | sqm_basement | sqm_living15 | sqm_lot15 | renovated | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
price | 1.000000 | 0.308350 | 0.525138 | 0.256794 | 0.266369 | 0.397293 | 0.036362 | 0.667434 | 0.054012 | 0.126434 | -0.053203 | 0.702035 | 0.089661 | 0.605567 | 0.323816 | 0.585379 | 0.082447 | 0.126092 |
bedrooms | 0.308350 | 1.000000 | 0.515884 | 0.175429 | -0.006582 | 0.079532 | 0.028472 | 0.356967 | 0.154178 | 0.018841 | -0.152668 | 0.576671 | 0.031703 | 0.477600 | 0.303093 | 0.391638 | 0.029244 | 0.018553 |
bathrooms | 0.525138 | 0.515884 | 1.000000 | 0.500653 | 0.063744 | 0.187737 | -0.124982 | 0.664983 | 0.506019 | 0.050739 | -0.203866 | 0.754665 | 0.087740 | 0.685342 | 0.283770 | 0.568634 | 0.087175 | 0.050260 |
floors | 0.256794 | 0.175429 | 0.500653 | 1.000000 | 0.023698 | 0.029444 | -0.263768 | 0.458183 | 0.489319 | 0.006338 | -0.059121 | 0.353949 | -0.005201 | 0.523885 | -0.245705 | 0.279885 | -0.011269 | 0.006260 |
waterfront | 0.266369 | -0.006582 | 0.063744 | 0.023698 | 1.000000 | 0.401857 | 0.016653 | 0.082775 | -0.026161 | 0.092885 | 0.030285 | 0.103818 | 0.021604 | 0.072075 | 0.080588 | 0.086463 | 0.030703 | 0.093294 |
view | 0.397293 | 0.079532 | 0.187737 | 0.029444 | 0.401857 | 1.000000 | 0.045990 | 0.251321 | -0.053440 | 0.103917 | 0.084827 | 0.284611 | 0.074710 | 0.167649 | 0.276947 | 0.280439 | 0.072575 | 0.104062 |
condition | 0.036362 | 0.028472 | -0.124982 | -0.263768 | 0.016653 | 0.045990 | 1.000000 | -0.144674 | -0.361417 | -0.060618 | 0.003026 | -0.058753 | -0.008958 | -0.158214 | 0.174105 | -0.092824 | -0.003406 | -0.060139 |
grade | 0.667434 | 0.356967 | 0.664983 | 0.458183 | 0.082775 | 0.251321 | -0.144674 | 1.000000 | 0.446963 | 0.014414 | -0.184862 | 0.762704 | 0.113621 | 0.755923 | 0.168392 | 0.713202 | 0.119248 | 0.014008 |
yr_built | 0.054012 | 0.154178 | 0.506019 | 0.489319 | -0.026161 | -0.053440 | -0.361417 | 0.446963 | 1.000000 | -0.224874 | -0.346869 | 0.318049 | 0.053080 | 0.423898 | -0.133124 | 0.326229 | 0.070958 | -0.225195 |
yr_renovated | 0.126434 | 0.018841 | 0.050739 | 0.006338 | 0.092885 | 0.103917 | -0.060618 | 0.014414 | -0.224874 | 1.000000 | 0.064357 | 0.055363 | 0.007644 | 0.023285 | 0.071323 | -0.002673 | 0.007854 | 0.999968 |
zipcode | -0.053203 | -0.152668 | -0.203866 | -0.059121 | 0.030285 | 0.084827 | 0.003026 | -0.184862 | -0.346869 | 0.064357 | 1.000000 | -0.199430 | -0.129574 | -0.261190 | 0.074845 | -0.279033 | -0.147221 | 0.064335 |
sqm_living | 0.702035 | 0.576671 | 0.754665 | 0.353949 | 0.103818 | 0.284611 | -0.058753 | 0.762704 | 0.318049 | 0.055363 | -0.199430 | 1.000000 | 0.172826 | 0.876597 | 0.435043 | 0.756420 | 0.183286 | 0.055094 |
sqm_lot | 0.089661 | 0.031703 | 0.087740 | -0.005201 | 0.021604 | 0.074710 | -0.008958 | 0.113621 | 0.053080 | 0.007644 | -0.129574 | 0.172826 | 1.000000 | 0.183512 | 0.015286 | 0.144608 | 0.718557 | 0.007745 |
sqm_above | 0.605567 | 0.477600 | 0.685342 | 0.523885 | 0.072075 | 0.167649 | -0.158214 | 0.755923 | 0.423898 | 0.023285 | -0.261190 | 0.876597 | 0.183512 | 1.000000 | -0.051943 | 0.731870 | 0.194050 | 0.023178 |
sqm_basement | 0.323816 | 0.303093 | 0.283770 | -0.245705 | 0.080588 | 0.276947 | 0.174105 | 0.168392 | -0.133124 | 0.071323 | 0.074845 | 0.435043 | 0.015286 | -0.051943 | 1.000000 | 0.200355 | 0.017276 | 0.070963 |
sqm_living15 | 0.585379 | 0.391638 | 0.568634 | 0.279885 | 0.086463 | 0.280439 | -0.092824 | 0.713202 | 0.326229 | -0.002673 | -0.279033 | 0.756420 | 0.144608 | 0.731870 | 0.200355 | 1.000000 | 0.183192 | -0.002755 |
sqm_lot15 | 0.082447 | 0.029244 | 0.087175 | -0.011269 | 0.030703 | 0.072575 | -0.003406 | 0.119248 | 0.070958 | 0.007854 | -0.147221 | 0.183286 | 0.718557 | 0.194050 | 0.017276 | 0.183192 | 1.000000 | 0.007920 |
renovated | 0.126092 | 0.018553 | 0.050260 | 0.006260 | 0.093294 | 0.104062 | -0.060139 | 0.014008 | -0.225195 | 0.999968 | 0.064335 | 0.055094 | 0.007745 | 0.023178 | 0.070963 | -0.002755 | 0.007920 | 1.000000 |
# Correlation of every variable with the price, in ascending order.
corr.loc[:, 'price'].sort_values(ascending=True)
zipcode -0.053203 condition 0.036362 yr_built 0.054012 sqm_lot15 0.082447 sqm_lot 0.089661 renovated 0.126092 yr_renovated 0.126434 floors 0.256794 waterfront 0.266369 bedrooms 0.308350 sqm_basement 0.323816 view 0.397293 bathrooms 0.525138 sqm_living15 0.585379 sqm_above 0.605567 grade 0.667434 sqm_living 0.702035 price 1.000000 Name: price, dtype: float64
seaborn
:# Avec la palette par défaut...
# Correlation heatmap with the colour scale fixed to [-1, 1] and centred on 0.
ax = sns.heatmap(corr, vmin=-1, vmax=1, center=0)
heatmap
mais aussi en utilisant l'API matplotlib
pour spécifier certains aspects (ici l'orientation des labels sur l'axe des abscisses et la taille de la figure)# En ne prenant pas en compte certaines colonnes
# et en choisissant une palette de couleurs
# Annotated correlation heatmap with square cells and a diverging palette.
ax = sns.heatmap(corr, vmin=-1, vmax=1, center=0, cmap="coolwarm", square=True, annot=True)
# Tilt the x-axis labels and right-align them.
x_labels = ax.get_xticklabels()
ax.set_xticklabels(x_labels, rotation=45, horizontalalignment='right')
# Resize the figure that owns the Axes.
ax.figure.set_size_inches(15, 15)
ax
<Axes: >
# Explanatory variable kept as a one-column DataFrame, target as a Series.
x = house_sales.loc[:, ['sqm_living']]
y = house_sales.loc[:, 'price']
On utilise `seaborn` et sa fonction `regplot` pour créer un scatterplot et afficher une droite de régression :
sns.regplot(x=x, y=y)
<Axes: xlabel='sqm_living', ylabel='price'>
La bibliothèque `seaborn` se base sur `matplotlib` : on pourra ainsi passer des paramètres supplémentaires, en utilisant les paramètres de `matplotlib`, pour personnaliser le graphique produit.
De même, elle se base sur `statsmodels`, et certains arguments permettent d'utiliser un autre modèle de régression que celui utilisé par défaut.
help(sns.regplot)
Help on function regplot in module seaborn.regression: regplot(data=None, *, x=None, y=None, x_estimator=None, x_bins=None, x_ci='ci', scatter=True, fit_reg=True, ci=95, n_boot=1000, units=None, seed=None, order=1, logistic=False, lowess=False, robust=False, logx=False, x_partial=None, y_partial=None, truncate=True, dropna=True, x_jitter=None, y_jitter=None, label=None, color=None, marker='o', scatter_kws=None, line_kws=None, ax=None) Plot data and a linear regression model fit. There are a number of mutually exclusive options for estimating the regression model. See the :ref:`tutorial <regression_tutorial>` for more information. Parameters ---------- x, y: string, series, or vector array Input variables. If strings, these should correspond with column names in ``data``. When pandas objects are used, axes will be labeled with the series name. data : DataFrame Tidy ("long-form") dataframe where each column is a variable and each row is an observation. x_estimator : callable that maps vector -> scalar, optional Apply this function to each unique value of ``x`` and plot the resulting estimate. This is useful when ``x`` is a discrete variable. If ``x_ci`` is given, this estimate will be bootstrapped and a confidence interval will be drawn. x_bins : int or vector, optional Bin the ``x`` variable into discrete bins and then estimate the central tendency and a confidence interval. This binning only influences how the scatterplot is drawn; the regression is still fit to the original data. This parameter is interpreted either as the number of evenly-sized (not necessary spaced) bins or the positions of the bin centers. When this parameter is used, it implies that the default of ``x_estimator`` is ``numpy.mean``. x_ci : "ci", "sd", int in [0, 100] or None, optional Size of the confidence interval used when plotting a central tendency for discrete values of ``x``. If ``"ci"``, defer to the value of the ``ci`` parameter. 
If ``"sd"``, skip bootstrapping and show the standard deviation of the observations in each bin. scatter : bool, optional If ``True``, draw a scatterplot with the underlying observations (or the ``x_estimator`` values). fit_reg : bool, optional If ``True``, estimate and plot a regression model relating the ``x`` and ``y`` variables. ci : int in [0, 100] or None, optional Size of the confidence interval for the regression estimate. This will be drawn using translucent bands around the regression line. The confidence interval is estimated using a bootstrap; for large datasets, it may be advisable to avoid that computation by setting this parameter to None. n_boot : int, optional Number of bootstrap resamples used to estimate the ``ci``. The default value attempts to balance time and stability; you may want to increase this value for "final" versions of plots. units : variable name in ``data``, optional If the ``x`` and ``y`` observations are nested within sampling units, those can be specified here. This will be taken into account when computing the confidence intervals by performing a multilevel bootstrap that resamples both units and observations (within unit). This does not otherwise influence how the regression is estimated or drawn. seed : int, numpy.random.Generator, or numpy.random.RandomState, optional Seed or random number generator for reproducible bootstrapping. order : int, optional If ``order`` is greater than 1, use ``numpy.polyfit`` to estimate a polynomial regression. logistic : bool, optional If ``True``, assume that ``y`` is a binary variable and use ``statsmodels`` to estimate a logistic regression model. Note that this is substantially more computationally intensive than linear regression, so you may wish to decrease the number of bootstrap resamples (``n_boot``) or set ``ci`` to None. lowess : bool, optional If ``True``, use ``statsmodels`` to estimate a nonparametric lowess model (locally weighted linear regression). 
Note that confidence intervals cannot currently be drawn for this kind of model. robust : bool, optional If ``True``, use ``statsmodels`` to estimate a robust regression. This will de-weight outliers. Note that this is substantially more computationally intensive than standard linear regression, so you may wish to decrease the number of bootstrap resamples (``n_boot``) or set ``ci`` to None. logx : bool, optional If ``True``, estimate a linear regression of the form y ~ log(x), but plot the scatterplot and regression model in the input space. Note that ``x`` must be positive for this to work. {x,y}_partial : strings in ``data`` or matrices Confounding variables to regress out of the ``x`` or ``y`` variables before plotting. truncate : bool, optional If ``True``, the regression line is bounded by the data limits. If ``False``, it extends to the ``x`` axis limits. {x,y}_jitter : floats, optional Add uniform random noise of this size to either the ``x`` or ``y`` variables. The noise is added to a copy of the data after fitting the regression, and only influences the look of the scatterplot. This can be helpful when plotting variables that take discrete values. label : string Label to apply to either the scatterplot or regression line (if ``scatter`` is ``False``) for use in a legend. color : matplotlib color Color to apply to all plot elements; will be superseded by colors passed in ``scatter_kws`` or ``line_kws``. marker : matplotlib marker code Marker to use for the scatterplot glyphs. {scatter,line}_kws : dictionaries Additional keyword arguments to pass to ``plt.scatter`` and ``plt.plot``. ax : matplotlib Axes, optional Axes object to draw the plot onto, otherwise uses the current Axes. Returns ------- ax : matplotlib Axes The Axes object containing the plot. See Also -------- lmplot : Combine :func:`regplot` and :class:`FacetGrid` to plot multiple linear relationships in a dataset. 
jointplot : Combine :func:`regplot` and :class:`JointGrid` (when used with ``kind="reg"``). pairplot : Combine :func:`regplot` and :class:`PairGrid` (when used with ``kind="reg"``). residplot : Plot the residuals of a linear regression model. Notes ----- The :func:`regplot` and :func:`lmplot` functions are closely related, but the former is an axes-level function while the latter is a figure-level function that combines :func:`regplot` and :class:`FacetGrid`. It's also easy to combine :func:`regplot` and :class:`JointGrid` or :class:`PairGrid` through the :func:`jointplot` and :func:`pairplot` functions, although these do not directly accept all of :func:`regplot`'s parameters. Examples -------- .. include: ../docstrings/regplot.rst
# Same regression plot, with a red fit line and semi-transparent points.
line_style = {'color': 'red'}
scatter_style = {'alpha': 0.3}
sns.regplot(x=x, y=y, scatter_kws=scatter_style, line_kws=line_style)
<Axes: xlabel='sqm_living', ylabel='price'>
statsmodels
qui propose d'estimer des modèles statistiques et d’effectuer des tests statistiques (en offrant des résumés comparables à ceux qu’on peut obtenir dans R) :import statsmodels.api as sm
# Ordinary least squares of y on x, with an explicit intercept column.
design_matrix = sm.add_constant(x)
model = sm.OLS(y, design_matrix)
results = model.fit()
# Print the regression summary table.
print(results.summary())
OLS Regression Results ============================================================================== Dep. Variable: price R-squared: 0.493 Model: OLS Adj. R-squared: 0.493 Method: Least Squares F-statistic: 2.100e+04 Date: Tue, 16 May 2023 Prob (F-statistic): 0.00 Time: 09:33:02 Log-Likelihood: -3.0027e+05 No. Observations: 21613 AIC: 6.005e+05 Df Residuals: 21611 BIC: 6.006e+05 Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ const -4.358e+04 4402.690 -9.899 0.000 -5.22e+04 -3.5e+04 sqm_living 3020.6321 20.843 144.920 0.000 2979.777 3061.487 ============================================================================== Omnibus: 14832.490 Durbin-Watson: 1.983 Prob(Omnibus): 0.000 Jarque-Bera (JB): 546444.709 Skew: 2.824 Prob(JB): 0.00 Kurtosis: 26.977 Cond. No. 523. ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
from sklearn import linear_model
# Fit the same simple linear regression with scikit-learn.
model = linear_model.LinearRegression()
model.fit(x, y)
# fit() returns the estimator itself, so `results` is the fitted model.
results = model
print(model.intercept_, model.coef_)
-43580.74032708525 [3020.63207124]
# Predict the price for a living area of 438. The model was fitted on a
# DataFrame, so the input reuses the training column names; a bare nested
# list triggers scikit-learn's "X does not have valid feature names" warning.
model.predict(pd.DataFrame([[438]], columns=x.columns))
/home/mthh/code/presentation-python-r-shs/env/lib/python3.10/site-packages/sklearn/base.py:439: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn(
array([1279456.10687803])
# Scatter of the observations with the fitted regression line drawn on top,
# using an explicit Axes object.
fig_reg, ax_reg = plt.subplots()
ax_reg.scatter(x, y, color='g', alpha=0.3)
ax_reg.plot(x, model.predict(x), color='r')
ax_reg.set_title('House price vs. sqm living')
ax_reg.set_xlabel('Living space (sq m)')
ax_reg.set_ylabel('Price (million dollars)')
plt.show()
import rpy2.rinterface
%load_ext rpy2.ipython
%%R -i house_sales -o my_coef
# ^^ -i is followed by the name(s) of the Python variable(s) to use in R
# ^^ -o is followed by the name(s) of the R variable(s) sent back to Python
# Fit a linear model of price on sqm_living and print its summary.
model <- lm('price ~ sqm_living', house_sales)
print(summary(model))
# Keep the fitted coefficients; my_coef is the variable listed after -o above.
my_coef <- coef(model)
/home/mthh/code/presentation-python-r-shs/env/lib/python3.10/site-packages/rpy2/robjects/pandas2ri.py:65: UserWarning: Error while trying to convert the column "id". Fall back to string conversion. The error is: integer 7129300520 does not fit '32-bit int' warnings.warn('Error while trying to convert '
Call: lm(formula = "price ~ sqm_living", data = house_sales) Residuals: Min 1Q Median 3Q Max -1476062 -147486 -24043 106182 4362067 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -43580.74 4402.69 -9.899 <2e-16 *** sqm_living 3020.63 20.84 144.920 <2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 261500 on 21611 degrees of freedom Multiple R-squared: 0.4929, Adjusted R-squared: 0.4928 F-statistic: 2.1e+04 on 1 and 21611 DF, p-value: < 2.2e-16
print(my_coef)
[-43580.74032708 3020.63207124]
import geopandas as gpd
from shapely import Point
# Build the point geometries from the longitude / latitude columns in one
# vectorised call instead of creating one Point per row in Python.
geometry = gpd.points_from_xy(house_sales.long, house_sales.lat)
# Wrap the table in a GeoDataFrame, declaring the coordinates as EPSG:4326.
gdf = gpd.GeoDataFrame(house_sales, crs="EPSG:4326", geometry=geometry)
gdf.plot(color="orange", markersize=0.4, alpha=0.3)
<Axes: >
import contextily as cx
# Reproject the points to EPSG:3857, plot them, then add a contextily basemap
# underneath on the same Axes.
gdf_wm = gdf.to_crs("EPSG:3857")
ax = gdf_wm.plot(color="red", markersize=0.4, alpha=0.5, figsize=(10, 10))
cx.add_basemap(ax)
# Price per square metre. Both operands come from the same frame, so the
# division does not rely on index alignment between gdf and gdf_wm.
gdf_wm['ppsqm'] = gdf_wm['price'] / gdf_wm['sqm_living']
# Choropleth of the points, coloured by price per square metre in 6 quantile classes.
ax = gdf_wm.plot(column='ppsqm', scheme='Quantiles', k=6, figsize=(16, 16), legend=True)
cx.add_basemap(ax)
import folium
# This time only the waterfront houses are displayed.
gdf_with_view = gdf.loc[gdf['waterfront'] == 1]
# Bounding box of that subset, unpacked into its four corners.
bounds = gdf_with_view.total_bounds
min_lon, min_lat, max_lon, max_lat = bounds
# Midpoint of the bounding box.
center = [(max_lon + min_lon) / 2, (max_lat + min_lat) / 2]
center  # Longitude, Latitude, whereas Folium expects Latitude, Longitude
[-122.28649999999999, 47.55025]
# Interactive map centred on the waterfront houses. `center` is
# [longitude, latitude], so it is reversed for folium.
# "Stamen Terrain" is no longer a built-in folium tile set, so the built-in
# "OpenStreetMap" tiles are used instead.
# NOTE(review): the name `map` shadows Python's built-in `map`; it is kept so
# that any later cell referring to it keeps working — consider renaming.
map = folium.Map(location=center[::-1], tiles="OpenStreetMap", zoom_start=10)
# One blue marker per waterfront house, with price and living area in the popup.
for index, row in gdf_with_view.iterrows():
    coordinates = [row['lat'], row['long']]
    map.add_child(
        folium.Marker(
            location=coordinates,
            popup=f'''
Prix: {row['price']}$
<br>
Superficie habitable: {round(row['sqm_living'], 1)}m2
''',
            icon=folium.Icon(color="blue"),
        )
    )
map